[llvm] cc39c3b - [Codegen][LegalizeIntegerTypes] New legalization strategy for scalar shifts: shift through stack

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Sat Jan 14 08:12:59 PST 2023


Author: Roman Lebedev
Date: 2023-01-14T19:12:18+03:00
New Revision: cc39c3b17fb2598e20ca0854f9fe6d69169d85c7

URL: https://github.com/llvm/llvm-project/commit/cc39c3b17fb2598e20ca0854f9fe6d69169d85c7
DIFF: https://github.com/llvm/llvm-project/commit/cc39c3b17fb2598e20ca0854f9fe6d69169d85c7.diff

LOG: [Codegen][LegalizeIntegerTypes] New legalization strategy for scalar shifts: shift through stack

https://reviews.llvm.org/D140493 is going to teach SROA how to promote allocas
that have variably-indexed loads. That raises cost-model questions,
since it requires creating wide shifts.

Indeed, our legalization for such shifts is not optimal:
we either split them into parts, or lower them into a libcall.
But if the shift amount is a multiple of CHAR_BIT,
we can also legalize them through the stack.

The basic idea is very simple:
1. Get a stack slot 2x the width of the shift type.
2. Store the value we are shifting into one half of the slot.
3. Pad the other half of the slot: with zeros for logical shifts, with the sign bit for arithmetic shifts.
4. Index into the slot (starting from the base half into which we spilled, either upwards or downwards).
5. Load.
6. Split the loaded integer.

This works for both little-endian and big-endian machines:
https://alive2.llvm.org/ce/z/YNVwd5

And better yet, if the original shift amount was not a multiple of CHAR_BIT,
we can just shift by that remainder afterwards: https://alive2.llvm.org/ce/z/pz5G-K
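
For illustration, here is a minimal C++ sketch of the same sequence for an
unsigned 128-bit logical right shift on a little-endian host. It is only a
conceptual mirror of the DAG lowering, not part of the patch; the function
name is made up, and it relies on the GCC/Clang unsigned __int128 extension:

#include <cstring>

unsigned __int128 lshr_i128_via_stack(unsigned __int128 X, unsigned ShAmt) {
  // 1. Get a stack slot 2x the width of the shift type (32 bytes for i128).
  unsigned char Slot[32];
  // 2. Store the value being shifted into one half of the slot; for a right
  //    shift on little-endian that is the low half.
  std::memcpy(Slot, &X, 16);
  // 3. Pad the other half with zeros (sign bytes for an arithmetic shift).
  std::memset(Slot + 16, 0, 16);
  // 4. Index into the slot by the whole-byte part of the shift amount,
  //    clamped so the load cannot go out of bounds.
  unsigned ByteOff = (ShAmt / 8) & 15;
  // 5. Load the shifted value back.
  unsigned __int128 Res;
  std::memcpy(&Res, Slot + ByteOff, 16);
  // 6. If the shift amount was not a multiple of CHAR_BIT, finish with an
  //    in-register shift by the remaining 0..7 bits.
  return Res >> (ShAmt % 8);
}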

I think that if we are going to perform the shift->shift-by-parts expansion more than once
(e.g. for an i128 shift on a 32-bit target, where the resulting i64 halves would themselves
need to be expanded again), we should instead go through the stack, which is what this patch does.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D140638

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/lib/Target/AVR/AVRISelLowering.h
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
    llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
    llvm/test/CodeGen/Mips/llvm-ir/shl.ll
    llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
    llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/RISCV/shifts.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
    llvm/test/CodeGen/X86/scheduler-backtracking.ll
    llvm/test/CodeGen/X86/shift-i128.ll
    llvm/test/CodeGen/X86/shift-i256.ll
    llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
    llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 92c58e0a767d4..aa85a78fe6ad7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -915,11 +915,19 @@ class TargetLoweringBase {
     return RepRegClassCostForVT[VT.SimpleTy];
   }
 
-  /// Return true if SHIFT instructions should be expanded to SHIFT_PARTS
-  /// instructions, and false if a library call is preferred (e.g for code-size
-  /// reasons).
-  virtual bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
-    return true;
+  /// Return the preferred strategy to legalize this SHIFT instruction, with
+  /// \p ExpansionFactor being the recursion depth - how many expansions are needed.
+  enum class ShiftLegalizationStrategy {
+    ExpandToParts,
+    ExpandThroughStack,
+    LowerToLibcall
+  };
+  virtual ShiftLegalizationStrategy
+  preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+                                     unsigned ExpansionFactor) const {
+    if (ExpansionFactor == 1)
+      return ShiftLegalizationStrategy::ExpandToParts;
+    return ShiftLegalizationStrategy::ExpandThroughStack;
   }
 
   /// Return true if the target has native support for the specified value type.

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 4e640247a5bf7..c9ce9071a25dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4140,6 +4140,111 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
   SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
 }
 
+void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
+                                                      SDValue &Hi) {
+  SDLoc dl(N);
+  SDValue Shiftee = N->getOperand(0);
+  EVT VT = Shiftee.getValueType();
+  SDValue ShAmt = N->getOperand(1);
+  EVT ShAmtVT = ShAmt.getValueType();
+
+  // This legalization is optimal when the shift is by a multiple of byte width,
+  //   %x * 8 <-> %x << 3   so 3 low bits should be known zero.
+  bool ShiftByByteMultiple =
+      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
+
+  // If we can't do it as one step, we'll have two uses of shift amount,
+  // and thus must freeze it.
+  if (!ShiftByByteMultiple)
+    ShAmt = DAG.getFreeze(ShAmt);
+
+  unsigned VTBitWidth = VT.getScalarSizeInBits();
+  assert(VTBitWidth % 8 == 0 && "Shifting a not byte multiple value?");
+  unsigned VTByteWidth = VTBitWidth / 8;
+  assert(isPowerOf2_32(VTByteWidth) &&
+         "Shiftee type size is not a power of two!");
+  unsigned StackSlotByteWidth = 2 * VTByteWidth;
+  unsigned StackSlotBitWidth = 8 * StackSlotByteWidth;
+  EVT StackSlotVT = EVT::getIntegerVT(*DAG.getContext(), StackSlotBitWidth);
+
+  // Get a temporary stack slot 2x the width of our VT.
+  // FIXME: reuse stack slots?
+  // FIXME: should we be more picky about alignment?
+  Align StackSlotAlignment(1);
+  SDValue StackPtr = DAG.CreateStackTemporary(
+      TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
+  EVT PtrTy = StackPtr.getValueType();
+  SDValue Ch = DAG.getEntryNode();
+
+  MachinePointerInfo StackPtrInfo = MachinePointerInfo::getFixedStack(
+      DAG.getMachineFunction(),
+      cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+
+  // Extend the value that is being shifted to the entire stack slot's width.
+  SDValue Init;
+  if (N->getOpcode() != ISD::SHL) {
+    unsigned WideningOpc =
+        N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    Init = DAG.getNode(WideningOpc, dl, StackSlotVT, Shiftee);
+  } else {
+    // For left-shifts, pad the Shiftee's LSB with zeros to twice its width.
+    SDValue AllZeros = DAG.getConstant(0, dl, VT);
+    Init = DAG.getNode(ISD::BUILD_PAIR, dl, StackSlotVT, AllZeros, Shiftee);
+  }
+  // And spill it into the stack slot.
+  Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
+
+  // Now, compute the full-byte offset into the stack slot from where we can load.
+  // We have the shift amount in bits, but we need it in whole bytes,
+  // so just divide by CHAR_BIT.
+  SDNodeFlags Flags;
+  if (ShiftByByteMultiple)
+    Flags.setExact(true);
+  SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
+                                   DAG.getConstant(3, dl, ShAmtVT), Flags);
+  // And clamp it, because OOB load is an immediate UB,
+  // while shift overflow would have *just* been poison.
+  ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset,
+                           DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
+  // We have exactly two strategies for indexing into the stack slot here:
+  // 1. upwards starting from the beginning of the slot
+  // 2. downwards starting from the middle of the slot
+  // On a little-endian machine, we pick 1. for right shifts and 2. for left
+  // shifts, and vice versa on a big-endian machine.
+  bool WillIndexUpwards = N->getOpcode() != ISD::SHL;
+  if (DAG.getDataLayout().isBigEndian())
+    WillIndexUpwards = !WillIndexUpwards;
+
+  SDValue AdjStackPtr;
+  if (WillIndexUpwards) {
+    AdjStackPtr = StackPtr;
+  } else {
+    AdjStackPtr = DAG.getMemBasePlusOffset(
+        StackPtr, DAG.getConstant(VTByteWidth, dl, PtrTy), dl);
+    ByteOffset = DAG.getNegative(ByteOffset, dl, ShAmtVT);
+  }
+
+  // Get the pointer somewhere into the stack slot from which we need to load.
+  ByteOffset = DAG.getSExtOrTrunc(ByteOffset, dl, PtrTy);
+  AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
+
+  // And load it! While the load is not legal, legalizing it is obvious.
+  SDValue Res = DAG.getLoad(
+      VT, dl, Ch, AdjStackPtr,
+      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
+  // We've performed the shift by CHAR_BIT * (ShAmt / CHAR_BIT).
+
+  // If we may still have a less-than-CHAR_BIT amount to shift by, do so now.
+  if (!ShiftByByteMultiple) {
+    SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+                                   DAG.getConstant(7, dl, ShAmtVT));
+    Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
+  }
+
+  // Finally, split the computed value.
+  SplitInteger(Res, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
                                           SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
@@ -4175,7 +4280,24 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
     (Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
     Action == TargetLowering::Custom;
 
-  if (LegalOrCustom && TLI.shouldExpandShift(DAG, N)) {
+  unsigned ExpansionFactor = 1;
+  // That VT->NVT expansion is one step. But will we re-expand NVT?
+  for (EVT TmpVT = NVT;;) {
+    EVT NewTMPVT = TLI.getTypeToTransformTo(*DAG.getContext(), TmpVT);
+    if (NewTMPVT == TmpVT)
+      break;
+    TmpVT = NewTMPVT;
+    ++ExpansionFactor;
+  }
+
+  TargetLowering::ShiftLegalizationStrategy S =
+      TLI.preferredShiftLegalizationStrategy(DAG, N, ExpansionFactor);
+
+  if (S == TargetLowering::ShiftLegalizationStrategy::ExpandThroughStack)
+    return ExpandIntRes_ShiftThroughStack(N, Lo, Hi);
+
+  if (LegalOrCustom &&
+      S != TargetLowering::ShiftLegalizationStrategy::LowerToLibcall) {
     // Expand the subcomponents.
     SDValue LHSL, LHSH;
     GetExpandedInteger(N->getOperand(0), LHSL, LHSH);

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 4dea7d663af13..b97e44a013196 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -457,6 +457,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ShiftThroughStack (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_Shift             (SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_MINMAX            (SDNode *N, SDValue &Lo, SDValue &Hi);

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1bfce0d993fd2..476a3ef7a5555 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22545,12 +22545,14 @@ bool AArch64TargetLowering::
   return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
 }
 
-bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
-                                              SDNode *N) const {
+TargetLowering::ShiftLegalizationStrategy
+AArch64TargetLowering::preferredShiftLegalizationStrategy(
+    SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
       !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
-    return false;
-  return true;
+    return ShiftLegalizationStrategy::LowerToLibcall;
+  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
+                                                            ExpansionFactor);
 }
 
 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e62a1bf2ec0b0..9cf99b3081217 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -799,7 +799,9 @@ class AArch64TargetLowering : public TargetLowering {
       unsigned OldShiftOpcode, unsigned NewShiftOpcode,
       SelectionDAG &DAG) const override;
 
-  bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
+  ShiftLegalizationStrategy
+  preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+                                     unsigned ExpansionFactor) const override;
 
   bool shouldTransformSignedTruncationCheck(EVT XVT,
                                             unsigned KeptBits) const override {

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index dd68153f14182..1afeeaff8f0b9 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21233,8 +21233,13 @@ bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
                                 : ARM_AM::getSOImmVal(MaskVal)) != -1;
 }
 
-bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
-  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
+TargetLowering::ShiftLegalizationStrategy
+ARMTargetLowering::preferredShiftLegalizationStrategy(
+    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
+  if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
+    return ShiftLegalizationStrategy::LowerToLibcall;
+  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
+                                                            ExpansionFactor);
 }
 
 Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 41e66739f571c..06da9977f8922 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -697,7 +697,9 @@ class VectorType;
       return HasStandaloneRem;
     }
 
-    bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
+    ShiftLegalizationStrategy
+    preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+                                       unsigned ExpansionFactor) const override;
 
     CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
     CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;

diff  --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index 74092ca547e73..80d94dc188a50 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -147,6 +147,12 @@ class AVRTargetLowering : public TargetLowering {
     return false;
   }
 
+  ShiftLegalizationStrategy
+  preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+                                     unsigned ExpansionFactor) const override {
+    return ShiftLegalizationStrategy::LowerToLibcall;
+  }
+
 private:
   SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
                     SelectionDAG &DAG, SDLoc dl) const;

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 3de2e4dd02328..91f1a0f2e3439 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -483,11 +483,15 @@ class RISCVTargetLowering : public TargetLowering {
     return ISD::SIGN_EXTEND;
   }
 
-  bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
+  TargetLowering::ShiftLegalizationStrategy
+  preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+                                     unsigned ExpansionFactor) const override {
     if (DAG.getMachineFunction().getFunction().hasMinSize())
-      return false;
-    return true;
+      return ShiftLegalizationStrategy::LowerToLibcall;
+    return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
+                                                              ExpansionFactor);
   }
+
   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;
 

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fa32b604e8dd9..9b98f901a2b22 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6045,12 +6045,14 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
   return true;
 }
 
-bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
-                                          SDNode *N) const {
+TargetLowering::ShiftLegalizationStrategy
+X86TargetLowering::preferredShiftLegalizationStrategy(
+    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
       !Subtarget.isOSWindows())
-    return false;
-  return true;
+    return ShiftLegalizationStrategy::LowerToLibcall;
+  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
+                                                            ExpansionFactor);
 }
 
 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 774d750f80c2b..7d9b2e1c3ea52 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1114,7 +1114,9 @@ namespace llvm {
       return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
     }
 
-    bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
+    ShiftLegalizationStrategy
+    preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+                                       unsigned ExpansionFactor) const override;
 
     bool shouldSplatInsEltVarIndex(EVT VT) const override;
 

diff  --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index b909fd3229f70..53f0d784ceed1 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -179,62 +179,22 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
 ; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov w8, #128
-; ALL-NEXT:    ldp x11, x10, [x0, #8]
-; ALL-NEXT:    lsl x9, x9, #3
-; ALL-NEXT:    ldr x12, [x0]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    ldr x13, [x0, #24]
-; ALL-NEXT:    and x17, x8, #0x38
-; ALL-NEXT:    mvn w0, w8
-; ALL-NEXT:    lsr x14, x10, #1
-; ALL-NEXT:    and x15, x9, #0x38
-; ALL-NEXT:    mvn w16, w9
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsl x3, x13, x17
-; ALL-NEXT:    lsr x14, x14, x0
-; ALL-NEXT:    lsl x17, x10, x17
-; ALL-NEXT:    orr x14, x3, x14
-; ALL-NEXT:    lsl x18, x13, #1
-; ALL-NEXT:    csel x0, xzr, x17, ne
-; ALL-NEXT:    csel x14, x17, x14, ne
-; ALL-NEXT:    lsl x17, x11, #1
-; ALL-NEXT:    lsr x8, x10, x15
-; ALL-NEXT:    lsl x1, x18, x16
-; ALL-NEXT:    lsr x3, x12, x15
-; ALL-NEXT:    lsl x16, x17, x16
-; ALL-NEXT:    orr x8, x1, x8
-; ALL-NEXT:    lsr x1, x13, x15
-; ALL-NEXT:    tst x9, #0x40
-; ALL-NEXT:    orr x16, x16, x3
-; ALL-NEXT:    lsr x15, x11, x15
-; ALL-NEXT:    csel x8, x1, x8, ne
-; ALL-NEXT:    csel x16, x15, x16, ne
-; ALL-NEXT:    csel x15, xzr, x15, ne
-; ALL-NEXT:    csel x17, xzr, x1, ne
-; ALL-NEXT:    subs x1, x9, #128
-; ALL-NEXT:    and x3, x1, #0x38
-; ALL-NEXT:    mvn w4, w1
-; ALL-NEXT:    csel x17, x17, xzr, lo
-; ALL-NEXT:    tst x1, #0x40
-; ALL-NEXT:    orr x16, x16, x0
-; ALL-NEXT:    orr x14, x15, x14
-; ALL-NEXT:    lsr x10, x10, x3
-; ALL-NEXT:    lsl x18, x18, x4
-; ALL-NEXT:    orr x10, x18, x10
-; ALL-NEXT:    lsr x13, x13, x3
-; ALL-NEXT:    csel x10, x13, x10, ne
-; ALL-NEXT:    csel x13, xzr, x13, ne
-; ALL-NEXT:    cmp x9, #128
-; ALL-NEXT:    csel x10, x16, x10, lo
-; ALL-NEXT:    csel x8, x8, xzr, lo
-; ALL-NEXT:    csel x13, x14, x13, lo
-; ALL-NEXT:    cmp x9, #0
-; ALL-NEXT:    csel x9, x12, x10, eq
-; ALL-NEXT:    csel x10, x11, x13, eq
-; ALL-NEXT:    stp x8, x17, [x2, #16]
-; ALL-NEXT:    stp x9, x10, [x2]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldr q1, [x0]
+; ALL-NEXT:    and x9, x9, #0x1f
+; ALL-NEXT:    add x8, x8, x9
+; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    stp x10, x11, [sp, #16]
+; ALL-NEXT:    str q1, [sp]
+; ALL-NEXT:    ldp x10, x9, [x8, #16]
+; ALL-NEXT:    ldr q0, [x8]
+; ALL-NEXT:    stp x10, x9, [x2, #16]
+; ALL-NEXT:    str q0, [x2]
+; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -246,62 +206,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
 ; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov w8, #128
-; ALL-NEXT:    ldp x10, x11, [x0, #8]
-; ALL-NEXT:    lsl x9, x9, #3
-; ALL-NEXT:    ldr x12, [x0, #24]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    ldr q1, [x0]
+; ALL-NEXT:    and x9, x9, #0x1f
 ; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    ldr x13, [x0]
-; ALL-NEXT:    and x17, x8, #0x38
-; ALL-NEXT:    mvn w0, w8
-; ALL-NEXT:    lsl x14, x10, #1
-; ALL-NEXT:    and x15, x9, #0x38
-; ALL-NEXT:    mvn w16, w9
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsr x3, x13, x17
-; ALL-NEXT:    lsl x14, x14, x0
-; ALL-NEXT:    lsr x17, x10, x17
-; ALL-NEXT:    orr x14, x14, x3
-; ALL-NEXT:    lsr x18, x13, #1
-; ALL-NEXT:    csel x0, xzr, x17, ne
-; ALL-NEXT:    csel x14, x17, x14, ne
-; ALL-NEXT:    lsr x17, x11, #1
-; ALL-NEXT:    lsl x8, x10, x15
-; ALL-NEXT:    lsr x1, x18, x16
-; ALL-NEXT:    lsl x3, x12, x15
-; ALL-NEXT:    lsr x16, x17, x16
-; ALL-NEXT:    orr x8, x8, x1
-; ALL-NEXT:    lsl x1, x13, x15
-; ALL-NEXT:    tst x9, #0x40
-; ALL-NEXT:    orr x16, x3, x16
-; ALL-NEXT:    lsl x15, x11, x15
-; ALL-NEXT:    csel x8, x1, x8, ne
-; ALL-NEXT:    csel x16, x15, x16, ne
-; ALL-NEXT:    csel x15, xzr, x15, ne
-; ALL-NEXT:    csel x17, xzr, x1, ne
-; ALL-NEXT:    subs x1, x9, #128
-; ALL-NEXT:    and x3, x1, #0x38
-; ALL-NEXT:    mvn w4, w1
-; ALL-NEXT:    csel x17, x17, xzr, lo
-; ALL-NEXT:    tst x1, #0x40
-; ALL-NEXT:    orr x16, x16, x0
-; ALL-NEXT:    orr x14, x15, x14
-; ALL-NEXT:    lsl x10, x10, x3
-; ALL-NEXT:    lsr x18, x18, x4
-; ALL-NEXT:    orr x10, x10, x18
-; ALL-NEXT:    lsl x13, x13, x3
-; ALL-NEXT:    csel x10, x13, x10, ne
-; ALL-NEXT:    csel x13, xzr, x13, ne
-; ALL-NEXT:    cmp x9, #128
-; ALL-NEXT:    csel x10, x16, x10, lo
-; ALL-NEXT:    csel x8, x8, xzr, lo
-; ALL-NEXT:    csel x13, x14, x13, lo
-; ALL-NEXT:    cmp x9, #0
-; ALL-NEXT:    csel x9, x12, x10, eq
-; ALL-NEXT:    csel x10, x11, x13, eq
-; ALL-NEXT:    stp x17, x8, [x2]
-; ALL-NEXT:    stp x10, x9, [x2, #16]
+; ALL-NEXT:    stp q0, q0, [sp]
+; ALL-NEXT:    stp x10, x11, [sp, #48]
+; ALL-NEXT:    str q1, [sp, #32]
+; ALL-NEXT:    ldp x9, x10, [x8, #16]
+; ALL-NEXT:    ldr q0, [x8]
+; ALL-NEXT:    stp x9, x10, [x2, #16]
+; ALL-NEXT:    str q0, [x2]
+; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -313,63 +234,23 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    ldp x11, x10, [x0, #16]
+; ALL-NEXT:    mov x8, sp
 ; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov w8, #128
-; ALL-NEXT:    ldp x11, x10, [x0, #8]
-; ALL-NEXT:    lsl x9, x9, #3
-; ALL-NEXT:    ldr x12, [x0]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    ldr x13, [x0, #24]
-; ALL-NEXT:    and x18, x8, #0x38
-; ALL-NEXT:    mvn w0, w8
-; ALL-NEXT:    lsr x14, x10, #1
-; ALL-NEXT:    and x15, x9, #0x38
-; ALL-NEXT:    mvn w16, w9
-; ALL-NEXT:    lsl x17, x13, #1
-; ALL-NEXT:    lsl x4, x13, x18
-; ALL-NEXT:    lsr x14, x14, x0
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsl x18, x10, x18
-; ALL-NEXT:    orr x14, x4, x14
-; ALL-NEXT:    lsr x8, x10, x15
-; ALL-NEXT:    lsl x1, x17, x16
-; ALL-NEXT:    csel x0, xzr, x18, ne
-; ALL-NEXT:    csel x14, x18, x14, ne
-; ALL-NEXT:    lsl x18, x11, #1
-; ALL-NEXT:    orr x8, x1, x8
-; ALL-NEXT:    lsr x1, x12, x15
-; ALL-NEXT:    lsl x16, x18, x16
-; ALL-NEXT:    asr x3, x13, x15
-; ALL-NEXT:    tst x9, #0x40
-; ALL-NEXT:    orr x16, x16, x1
-; ALL-NEXT:    lsr x15, x11, x15
-; ALL-NEXT:    asr x18, x13, #63
-; ALL-NEXT:    csel x8, x3, x8, ne
-; ALL-NEXT:    csel x16, x15, x16, ne
-; ALL-NEXT:    csel x15, xzr, x15, ne
-; ALL-NEXT:    csel x1, x18, x3, ne
-; ALL-NEXT:    subs x3, x9, #128
-; ALL-NEXT:    orr x16, x16, x0
-; ALL-NEXT:    and x4, x3, #0x38
-; ALL-NEXT:    mvn w5, w3
-; ALL-NEXT:    orr x14, x15, x14
-; ALL-NEXT:    lsr x10, x10, x4
-; ALL-NEXT:    lsl x17, x17, x5
-; ALL-NEXT:    orr x10, x17, x10
-; ALL-NEXT:    csel x17, x1, x18, lo
-; ALL-NEXT:    asr x13, x13, x4
-; ALL-NEXT:    tst x3, #0x40
-; ALL-NEXT:    csel x10, x13, x10, ne
-; ALL-NEXT:    csel x13, x18, x13, ne
-; ALL-NEXT:    cmp x9, #128
-; ALL-NEXT:    csel x10, x16, x10, lo
-; ALL-NEXT:    csel x8, x8, x18, lo
-; ALL-NEXT:    csel x13, x14, x13, lo
-; ALL-NEXT:    cmp x9, #0
-; ALL-NEXT:    csel x9, x12, x10, eq
-; ALL-NEXT:    csel x10, x11, x13, eq
-; ALL-NEXT:    stp x8, x17, [x2, #16]
-; ALL-NEXT:    stp x9, x10, [x2]
+; ALL-NEXT:    ldr q0, [x0]
+; ALL-NEXT:    asr x12, x10, #63
+; ALL-NEXT:    and x9, x9, #0x1f
+; ALL-NEXT:    add x8, x8, x9
+; ALL-NEXT:    stp x11, x10, [sp, #16]
+; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    stp x12, x12, [sp, #48]
+; ALL-NEXT:    stp x12, x12, [sp, #32]
+; ALL-NEXT:    ldp x10, x9, [x8, #16]
+; ALL-NEXT:    ldr q0, [x8]
+; ALL-NEXT:    stp x10, x9, [x2, #16]
+; ALL-NEXT:    str q0, [x2]
+; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1

diff  --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index c9caa58ac584f..710ff9ac701de 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -158,57 +158,39 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
 ; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov w8, #128
+; ALL-NEXT:    mov x8, sp
 ; ALL-NEXT:    ldp x10, x11, [x0, #16]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    mvn w16, w9
-; ALL-NEXT:    ldp x13, x12, [x0]
-; ALL-NEXT:    mvn w0, w8
-; ALL-NEXT:    lsr x14, x10, #1
-; ALL-NEXT:    lsl x1, x11, x8
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsl x8, x10, x8
-; ALL-NEXT:    lsl x17, x11, #1
-; ALL-NEXT:    lsr x14, x14, x0
-; ALL-NEXT:    csel x0, xzr, x8, ne
-; ALL-NEXT:    orr x14, x1, x14
-; ALL-NEXT:    lsr x15, x10, x9
-; ALL-NEXT:    csel x8, x8, x14, ne
-; ALL-NEXT:    lsl x14, x12, #1
-; ALL-NEXT:    lsl x3, x17, x16
-; ALL-NEXT:    lsr x1, x13, x9
-; ALL-NEXT:    lsl x14, x14, x16
-; ALL-NEXT:    lsr x18, x11, x9
-; ALL-NEXT:    orr x15, x3, x15
-; ALL-NEXT:    tst x9, #0x40
-; ALL-NEXT:    orr x14, x14, x1
-; ALL-NEXT:    lsr x16, x12, x9
-; ALL-NEXT:    csel x15, x18, x15, ne
-; ALL-NEXT:    csel x14, x16, x14, ne
-; ALL-NEXT:    csel x16, xzr, x16, ne
-; ALL-NEXT:    csel x18, xzr, x18, ne
-; ALL-NEXT:    subs x1, x9, #128
-; ALL-NEXT:    orr x14, x14, x0
-; ALL-NEXT:    mvn w3, w1
-; ALL-NEXT:    orr x8, x16, x8
-; ALL-NEXT:    lsr x10, x10, x1
-; ALL-NEXT:    lsr x11, x11, x1
-; ALL-NEXT:    lsl x17, x17, x3
-; ALL-NEXT:    orr x10, x17, x10
-; ALL-NEXT:    csel x17, x18, xzr, lo
-; ALL-NEXT:    tst x1, #0x40
-; ALL-NEXT:    csel x10, x11, x10, ne
-; ALL-NEXT:    csel x11, xzr, x11, ne
-; ALL-NEXT:    cmp x9, #128
-; ALL-NEXT:    csel x10, x14, x10, lo
-; ALL-NEXT:    csel x14, x15, xzr, lo
-; ALL-NEXT:    csel x8, x8, x11, lo
-; ALL-NEXT:    cmp x9, #0
-; ALL-NEXT:    csel x9, x13, x10, eq
-; ALL-NEXT:    csel x8, x12, x8, eq
-; ALL-NEXT:    stp x14, x17, [x2, #16]
-; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldr q1, [x0]
+; ALL-NEXT:    ubfx x12, x9, #3, #5
+; ALL-NEXT:    add x8, x8, x12
+; ALL-NEXT:    and x9, x9, #0x7
+; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    stp x10, x11, [sp, #16]
+; ALL-NEXT:    eor x11, x9, #0x3f
+; ALL-NEXT:    str q1, [sp]
+; ALL-NEXT:    ldp x10, x13, [x8, #8]
+; ALL-NEXT:    ldr x12, [x8, #24]
+; ALL-NEXT:    ldr x8, [x8]
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsr x10, x10, x9
+; ALL-NEXT:    lsl x15, x12, #1
+; ALL-NEXT:    lsl x14, x14, x11
+; ALL-NEXT:    lsl x11, x15, x11
+; ALL-NEXT:    mvn w15, w9
+; ALL-NEXT:    lsr x8, x8, x9
+; ALL-NEXT:    lsr x12, x12, x9
+; ALL-NEXT:    lsr x9, x13, x9
+; ALL-NEXT:    orr x8, x8, x14
+; ALL-NEXT:    orr x9, x9, x11
+; ALL-NEXT:    lsl x11, x13, #1
+; ALL-NEXT:    lsl x11, x11, x15
+; ALL-NEXT:    orr x10, x10, x11
+; ALL-NEXT:    stp x9, x12, [x2, #16]
+; ALL-NEXT:    stp x8, x10, [x2]
+; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -219,57 +201,40 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
 ; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov w8, #128
-; ALL-NEXT:    ldp x11, x10, [x0]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    mvn w16, w9
-; ALL-NEXT:    ldp x12, x13, [x0, #16]
-; ALL-NEXT:    mvn w0, w8
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsl x14, x10, #1
-; ALL-NEXT:    lsr x1, x11, x8
-; ALL-NEXT:    lsr x8, x10, x8
-; ALL-NEXT:    lsr x17, x11, #1
-; ALL-NEXT:    lsl x14, x14, x0
-; ALL-NEXT:    csel x0, xzr, x8, ne
-; ALL-NEXT:    orr x14, x14, x1
-; ALL-NEXT:    lsl x15, x10, x9
-; ALL-NEXT:    csel x8, x8, x14, ne
-; ALL-NEXT:    lsr x14, x12, #1
-; ALL-NEXT:    lsr x3, x17, x16
-; ALL-NEXT:    lsl x1, x13, x9
-; ALL-NEXT:    lsr x14, x14, x16
-; ALL-NEXT:    lsl x18, x11, x9
-; ALL-NEXT:    orr x15, x15, x3
-; ALL-NEXT:    tst x9, #0x40
-; ALL-NEXT:    orr x14, x1, x14
-; ALL-NEXT:    lsl x16, x12, x9
-; ALL-NEXT:    csel x15, x18, x15, ne
-; ALL-NEXT:    csel x14, x16, x14, ne
-; ALL-NEXT:    csel x16, xzr, x16, ne
-; ALL-NEXT:    csel x18, xzr, x18, ne
-; ALL-NEXT:    subs x1, x9, #128
-; ALL-NEXT:    orr x14, x14, x0
-; ALL-NEXT:    mvn w3, w1
-; ALL-NEXT:    orr x8, x16, x8
-; ALL-NEXT:    lsl x10, x10, x1
-; ALL-NEXT:    lsl x11, x11, x1
-; ALL-NEXT:    lsr x17, x17, x3
-; ALL-NEXT:    orr x10, x10, x17
-; ALL-NEXT:    csel x17, x18, xzr, lo
-; ALL-NEXT:    tst x1, #0x40
-; ALL-NEXT:    csel x10, x11, x10, ne
-; ALL-NEXT:    csel x11, xzr, x11, ne
-; ALL-NEXT:    cmp x9, #128
-; ALL-NEXT:    csel x10, x14, x10, lo
-; ALL-NEXT:    csel x14, x15, xzr, lo
-; ALL-NEXT:    csel x8, x8, x11, lo
-; ALL-NEXT:    cmp x9, #0
-; ALL-NEXT:    csel x9, x13, x10, eq
-; ALL-NEXT:    csel x8, x12, x8, eq
-; ALL-NEXT:    stp x17, x14, [x2]
-; ALL-NEXT:    stp x8, x9, [x2, #16]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    ldr q1, [x0]
+; ALL-NEXT:    ubfx x12, x9, #3, #5
+; ALL-NEXT:    sub x8, x8, x12
+; ALL-NEXT:    and x9, x9, #0x7
+; ALL-NEXT:    mvn w12, w9
+; ALL-NEXT:    eor x14, x9, #0x3f
+; ALL-NEXT:    stp q0, q0, [sp]
+; ALL-NEXT:    stp x10, x11, [sp, #48]
+; ALL-NEXT:    str q1, [sp, #32]
+; ALL-NEXT:    ldp x11, x10, [x8, #8]
+; ALL-NEXT:    ldr x13, [x8]
+; ALL-NEXT:    ldr x8, [x8, #24]
+; ALL-NEXT:    lsr x15, x11, #1
+; ALL-NEXT:    lsl x11, x11, x9
+; ALL-NEXT:    lsr x16, x10, #1
+; ALL-NEXT:    lsr x12, x15, x12
+; ALL-NEXT:    lsr x15, x13, #1
+; ALL-NEXT:    lsr x16, x16, x14
+; ALL-NEXT:    lsr x14, x15, x14
+; ALL-NEXT:    lsl x13, x13, x9
+; ALL-NEXT:    lsl x8, x8, x9
+; ALL-NEXT:    lsl x9, x10, x9
+; ALL-NEXT:    orr x11, x11, x14
+; ALL-NEXT:    orr x8, x8, x16
+; ALL-NEXT:    orr x9, x9, x12
+; ALL-NEXT:    stp x13, x11, [x2]
+; ALL-NEXT:    stp x9, x8, [x2, #16]
+; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -280,59 +245,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    ldp x11, x10, [x0, #16]
+; ALL-NEXT:    mov x8, sp
 ; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov w8, #128
-; ALL-NEXT:    ldp x11, x10, [x0, #8]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    ldr x13, [x0, #24]
-; ALL-NEXT:    mvn w18, w8
-; ALL-NEXT:    ldr x12, [x0]
-; ALL-NEXT:    mvn w16, w9
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsr x14, x10, #1
-; ALL-NEXT:    lsl x1, x13, x8
-; ALL-NEXT:    lsr x14, x14, x18
-; ALL-NEXT:    lsl x8, x10, x8
-; ALL-NEXT:    orr x14, x1, x14
-; ALL-NEXT:    lsl x17, x13, #1
-; ALL-NEXT:    csel x18, xzr, x8, ne
-; ALL-NEXT:    csel x8, x8, x14, ne
-; ALL-NEXT:    lsl x14, x11, #1
-; ALL-NEXT:    lsr x15, x10, x9
-; ALL-NEXT:    lsl x3, x17, x16
-; ALL-NEXT:    lsr x1, x12, x9
-; ALL-NEXT:    lsl x14, x14, x16
-; ALL-NEXT:    asr x0, x13, x9
-; ALL-NEXT:    orr x15, x3, x15
-; ALL-NEXT:    tst x9, #0x40
-; ALL-NEXT:    orr x14, x14, x1
-; ALL-NEXT:    lsr x16, x11, x9
-; ALL-NEXT:    asr x1, x13, #63
-; ALL-NEXT:    csel x15, x0, x15, ne
-; ALL-NEXT:    csel x14, x16, x14, ne
-; ALL-NEXT:    csel x16, xzr, x16, ne
-; ALL-NEXT:    csel x0, x1, x0, ne
-; ALL-NEXT:    subs x3, x9, #128
-; ALL-NEXT:    mvn w4, w3
-; ALL-NEXT:    orr x14, x14, x18
-; ALL-NEXT:    orr x8, x16, x8
-; ALL-NEXT:    lsr x10, x10, x3
-; ALL-NEXT:    asr x13, x13, x3
-; ALL-NEXT:    lsl x17, x17, x4
-; ALL-NEXT:    orr x10, x17, x10
-; ALL-NEXT:    csel x17, x0, x1, lo
-; ALL-NEXT:    tst x3, #0x40
-; ALL-NEXT:    csel x10, x13, x10, ne
-; ALL-NEXT:    csel x13, x1, x13, ne
-; ALL-NEXT:    cmp x9, #128
-; ALL-NEXT:    csel x10, x14, x10, lo
-; ALL-NEXT:    csel x14, x15, x1, lo
-; ALL-NEXT:    csel x8, x8, x13, lo
-; ALL-NEXT:    cmp x9, #0
-; ALL-NEXT:    csel x9, x12, x10, eq
-; ALL-NEXT:    csel x8, x11, x8, eq
-; ALL-NEXT:    stp x14, x17, [x2, #16]
-; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    ldr q0, [x0]
+; ALL-NEXT:    asr x12, x10, #63
+; ALL-NEXT:    stp x11, x10, [sp, #16]
+; ALL-NEXT:    ubfx x10, x9, #3, #5
+; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    add x8, x8, x10
+; ALL-NEXT:    and x9, x9, #0x7
+; ALL-NEXT:    stp x12, x12, [sp, #48]
+; ALL-NEXT:    eor x14, x9, #0x3f
+; ALL-NEXT:    stp x12, x12, [sp, #32]
+; ALL-NEXT:    mvn w12, w9
+; ALL-NEXT:    ldp x10, x11, [x8, #8]
+; ALL-NEXT:    ldr x13, [x8, #24]
+; ALL-NEXT:    ldr x8, [x8]
+; ALL-NEXT:    lsl x16, x10, #1
+; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsl x16, x16, x14
+; ALL-NEXT:    lsl x12, x15, x12
+; ALL-NEXT:    lsl x15, x13, #1
+; ALL-NEXT:    lsl x14, x15, x14
+; ALL-NEXT:    lsr x11, x11, x9
+; ALL-NEXT:    asr x13, x13, x9
+; ALL-NEXT:    lsr x8, x8, x9
+; ALL-NEXT:    lsr x9, x10, x9
+; ALL-NEXT:    orr x11, x11, x14
+; ALL-NEXT:    orr x8, x8, x16
+; ALL-NEXT:    orr x9, x9, x12
+; ALL-NEXT:    stp x11, x13, [x2, #16]
+; ALL-NEXT:    stp x8, x9, [x2]
+; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1

diff  --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 8be7100d368bb..47d18b9b5c533 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -396,298 +396,209 @@ entry:
 define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS-LABEL: ashr_i128:
 ; MIPS:       # %bb.0: # %entry
-; MIPS-NEXT:    lw $2, 28($sp)
-; MIPS-NEXT:    addiu $1, $zero, 64
-; MIPS-NEXT:    subu $9, $1, $2
-; MIPS-NEXT:    sllv $10, $5, $9
-; MIPS-NEXT:    andi $13, $9, 32
-; MIPS-NEXT:    andi $3, $2, 32
-; MIPS-NEXT:    addiu $11, $zero, 0
-; MIPS-NEXT:    bnez $13, $BB5_2
-; MIPS-NEXT:    addiu $12, $zero, 0
-; MIPS-NEXT:  # %bb.1: # %entry
-; MIPS-NEXT:    move $12, $10
-; MIPS-NEXT:  $BB5_2: # %entry
-; MIPS-NEXT:    not $8, $2
-; MIPS-NEXT:    bnez $3, $BB5_5
-; MIPS-NEXT:    srlv $14, $6, $2
-; MIPS-NEXT:  # %bb.3: # %entry
-; MIPS-NEXT:    sll $1, $6, 1
-; MIPS-NEXT:    srlv $11, $7, $2
-; MIPS-NEXT:    sllv $1, $1, $8
-; MIPS-NEXT:    or $15, $1, $11
-; MIPS-NEXT:    bnez $13, $BB5_7
-; MIPS-NEXT:    move $11, $14
-; MIPS-NEXT:  # %bb.4: # %entry
-; MIPS-NEXT:    b $BB5_6
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  $BB5_5:
-; MIPS-NEXT:    bnez $13, $BB5_7
-; MIPS-NEXT:    move $15, $14
-; MIPS-NEXT:  $BB5_6: # %entry
-; MIPS-NEXT:    sllv $1, $4, $9
-; MIPS-NEXT:    not $9, $9
-; MIPS-NEXT:    srl $10, $5, 1
-; MIPS-NEXT:    srlv $9, $10, $9
-; MIPS-NEXT:    or $10, $1, $9
-; MIPS-NEXT:  $BB5_7: # %entry
-; MIPS-NEXT:    addiu $24, $2, -64
-; MIPS-NEXT:    sll $13, $4, 1
-; MIPS-NEXT:    srav $14, $4, $24
-; MIPS-NEXT:    andi $1, $24, 32
-; MIPS-NEXT:    bnez $1, $BB5_10
-; MIPS-NEXT:    sra $9, $4, 31
-; MIPS-NEXT:  # %bb.8: # %entry
-; MIPS-NEXT:    srlv $1, $5, $24
-; MIPS-NEXT:    not $24, $24
-; MIPS-NEXT:    sllv $24, $13, $24
-; MIPS-NEXT:    or $25, $24, $1
-; MIPS-NEXT:    move $24, $14
-; MIPS-NEXT:    sltiu $14, $2, 64
-; MIPS-NEXT:    beqz $14, $BB5_12
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  # %bb.9: # %entry
-; MIPS-NEXT:    b $BB5_11
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  $BB5_10:
-; MIPS-NEXT:    move $25, $14
-; MIPS-NEXT:    sltiu $14, $2, 64
-; MIPS-NEXT:    beqz $14, $BB5_12
-; MIPS-NEXT:    move $24, $9
-; MIPS-NEXT:  $BB5_11:
-; MIPS-NEXT:    or $25, $15, $12
-; MIPS-NEXT:  $BB5_12: # %entry
-; MIPS-NEXT:    sltiu $12, $2, 1
-; MIPS-NEXT:    beqz $12, $BB5_18
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  # %bb.13: # %entry
-; MIPS-NEXT:    bnez $14, $BB5_19
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  $BB5_14: # %entry
-; MIPS-NEXT:    beqz $12, $BB5_20
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  $BB5_15: # %entry
-; MIPS-NEXT:    bnez $3, $BB5_21
-; MIPS-NEXT:    srav $4, $4, $2
-; MIPS-NEXT:  $BB5_16: # %entry
-; MIPS-NEXT:    srlv $1, $5, $2
-; MIPS-NEXT:    sllv $2, $13, $8
-; MIPS-NEXT:    or $3, $2, $1
-; MIPS-NEXT:    bnez $14, $BB5_23
-; MIPS-NEXT:    move $2, $4
-; MIPS-NEXT:  # %bb.17: # %entry
-; MIPS-NEXT:    b $BB5_22
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  $BB5_18: # %entry
-; MIPS-NEXT:    beqz $14, $BB5_14
-; MIPS-NEXT:    move $7, $25
-; MIPS-NEXT:  $BB5_19:
-; MIPS-NEXT:    bnez $12, $BB5_15
-; MIPS-NEXT:    or $24, $11, $10
-; MIPS-NEXT:  $BB5_20: # %entry
-; MIPS-NEXT:    move $6, $24
-; MIPS-NEXT:    beqz $3, $BB5_16
-; MIPS-NEXT:    srav $4, $4, $2
-; MIPS-NEXT:  $BB5_21:
-; MIPS-NEXT:    move $2, $9
-; MIPS-NEXT:    bnez $14, $BB5_23
-; MIPS-NEXT:    move $3, $4
-; MIPS-NEXT:  $BB5_22: # %entry
-; MIPS-NEXT:    move $2, $9
-; MIPS-NEXT:  $BB5_23: # %entry
-; MIPS-NEXT:    bnez $14, $BB5_25
-; MIPS-NEXT:    nop
-; MIPS-NEXT:  # %bb.24: # %entry
-; MIPS-NEXT:    move $3, $9
-; MIPS-NEXT:  $BB5_25: # %entry
-; MIPS-NEXT:    move $4, $6
+; MIPS-NEXT:    addiu $sp, $sp, -32
+; MIPS-NEXT:    .cfi_def_cfa_offset 32
+; MIPS-NEXT:    swl $7, 28($sp)
+; MIPS-NEXT:    swl $6, 24($sp)
+; MIPS-NEXT:    sra $1, $4, 31
+; MIPS-NEXT:    swl $5, 20($sp)
+; MIPS-NEXT:    swl $4, 16($sp)
+; MIPS-NEXT:    swl $1, 12($sp)
+; MIPS-NEXT:    swl $1, 8($sp)
+; MIPS-NEXT:    swl $1, 4($sp)
+; MIPS-NEXT:    swl $1, 0($sp)
+; MIPS-NEXT:    addiu $2, $sp, 0
+; MIPS-NEXT:    swr $7, 31($sp)
+; MIPS-NEXT:    swr $6, 27($sp)
+; MIPS-NEXT:    swr $5, 23($sp)
+; MIPS-NEXT:    swr $4, 19($sp)
+; MIPS-NEXT:    swr $1, 15($sp)
+; MIPS-NEXT:    swr $1, 11($sp)
+; MIPS-NEXT:    swr $1, 7($sp)
+; MIPS-NEXT:    swr $1, 3($sp)
+; MIPS-NEXT:    addiu $1, $2, 16
+; MIPS-NEXT:    lw $2, 60($sp)
+; MIPS-NEXT:    srl $3, $2, 3
+; MIPS-NEXT:    andi $3, $3, 15
+; MIPS-NEXT:    subu $1, $1, $3
+; MIPS-NEXT:    lwl $3, 4($1)
+; MIPS-NEXT:    lwr $3, 7($1)
+; MIPS-NEXT:    sll $4, $3, 1
+; MIPS-NEXT:    lwl $5, 8($1)
+; MIPS-NEXT:    lwr $5, 11($1)
+; MIPS-NEXT:    andi $2, $2, 7
+; MIPS-NEXT:    not $6, $2
+; MIPS-NEXT:    andi $6, $6, 31
+; MIPS-NEXT:    srlv $7, $5, $2
+; MIPS-NEXT:    sllv $4, $4, $6
+; MIPS-NEXT:    srlv $3, $3, $2
+; MIPS-NEXT:    lwl $6, 0($1)
+; MIPS-NEXT:    lwr $6, 3($1)
+; MIPS-NEXT:    sll $8, $6, 1
+; MIPS-NEXT:    xori $9, $2, 31
+; MIPS-NEXT:    sllv $8, $8, $9
+; MIPS-NEXT:    or $3, $3, $8
+; MIPS-NEXT:    or $4, $7, $4
+; MIPS-NEXT:    lwl $7, 12($1)
+; MIPS-NEXT:    lwr $7, 15($1)
+; MIPS-NEXT:    srlv $1, $7, $2
+; MIPS-NEXT:    sll $5, $5, 1
+; MIPS-NEXT:    sllv $5, $5, $9
+; MIPS-NEXT:    or $5, $1, $5
+; MIPS-NEXT:    srav $2, $6, $2
 ; MIPS-NEXT:    jr $ra
-; MIPS-NEXT:    move $5, $7
+; MIPS-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS32-LABEL: ashr_i128:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lw $9, 28($sp)
-; MIPS32-NEXT:    srlv $1, $7, $9
-; MIPS32-NEXT:    not $2, $9
-; MIPS32-NEXT:    sll $3, $6, 1
-; MIPS32-NEXT:    sllv $3, $3, $2
-; MIPS32-NEXT:    addiu $8, $zero, 64
-; MIPS32-NEXT:    or $1, $3, $1
-; MIPS32-NEXT:    srlv $10, $6, $9
-; MIPS32-NEXT:    subu $3, $8, $9
-; MIPS32-NEXT:    sllv $11, $5, $3
-; MIPS32-NEXT:    andi $12, $3, 32
-; MIPS32-NEXT:    andi $13, $9, 32
-; MIPS32-NEXT:    move $8, $11
-; MIPS32-NEXT:    movn $8, $zero, $12
-; MIPS32-NEXT:    movn $1, $10, $13
-; MIPS32-NEXT:    addiu $14, $9, -64
-; MIPS32-NEXT:    srlv $15, $5, $14
-; MIPS32-NEXT:    sll $24, $4, 1
-; MIPS32-NEXT:    not $25, $14
-; MIPS32-NEXT:    sllv $25, $24, $25
-; MIPS32-NEXT:    or $gp, $1, $8
-; MIPS32-NEXT:    or $1, $25, $15
-; MIPS32-NEXT:    srav $8, $4, $14
-; MIPS32-NEXT:    andi $14, $14, 32
-; MIPS32-NEXT:    movn $1, $8, $14
-; MIPS32-NEXT:    sllv $15, $4, $3
-; MIPS32-NEXT:    not $3, $3
-; MIPS32-NEXT:    srl $25, $5, 1
-; MIPS32-NEXT:    srlv $3, $25, $3
-; MIPS32-NEXT:    sltiu $25, $9, 64
-; MIPS32-NEXT:    movn $1, $gp, $25
-; MIPS32-NEXT:    or $15, $15, $3
-; MIPS32-NEXT:    srlv $3, $5, $9
-; MIPS32-NEXT:    sllv $2, $24, $2
-; MIPS32-NEXT:    or $5, $2, $3
-; MIPS32-NEXT:    srav $24, $4, $9
-; MIPS32-NEXT:    movn $5, $24, $13
-; MIPS32-NEXT:    sra $2, $4, 31
-; MIPS32-NEXT:    movz $1, $7, $9
-; MIPS32-NEXT:    move $3, $2
-; MIPS32-NEXT:    movn $3, $5, $25
-; MIPS32-NEXT:    movn $15, $11, $12
-; MIPS32-NEXT:    movn $10, $zero, $13
-; MIPS32-NEXT:    or $4, $10, $15
-; MIPS32-NEXT:    movn $8, $2, $14
-; MIPS32-NEXT:    movn $8, $4, $25
-; MIPS32-NEXT:    movz $8, $6, $9
-; MIPS32-NEXT:    movn $24, $2, $13
-; MIPS32-NEXT:    movn $2, $24, $25
-; MIPS32-NEXT:    move $4, $8
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    swl $7, 28($sp)
+; MIPS32-NEXT:    swl $6, 24($sp)
+; MIPS32-NEXT:    sra $1, $4, 31
+; MIPS32-NEXT:    swl $5, 20($sp)
+; MIPS32-NEXT:    swl $4, 16($sp)
+; MIPS32-NEXT:    swl $1, 12($sp)
+; MIPS32-NEXT:    swl $1, 8($sp)
+; MIPS32-NEXT:    swl $1, 4($sp)
+; MIPS32-NEXT:    swl $1, 0($sp)
+; MIPS32-NEXT:    addiu $2, $sp, 0
+; MIPS32-NEXT:    swr $7, 31($sp)
+; MIPS32-NEXT:    swr $6, 27($sp)
+; MIPS32-NEXT:    swr $5, 23($sp)
+; MIPS32-NEXT:    swr $4, 19($sp)
+; MIPS32-NEXT:    swr $1, 15($sp)
+; MIPS32-NEXT:    swr $1, 11($sp)
+; MIPS32-NEXT:    swr $1, 7($sp)
+; MIPS32-NEXT:    swr $1, 3($sp)
+; MIPS32-NEXT:    addiu $1, $2, 16
+; MIPS32-NEXT:    lw $2, 60($sp)
+; MIPS32-NEXT:    srl $3, $2, 3
+; MIPS32-NEXT:    andi $3, $3, 15
+; MIPS32-NEXT:    subu $1, $1, $3
+; MIPS32-NEXT:    lwl $3, 4($1)
+; MIPS32-NEXT:    lwr $3, 7($1)
+; MIPS32-NEXT:    sll $4, $3, 1
+; MIPS32-NEXT:    lwl $5, 8($1)
+; MIPS32-NEXT:    lwr $5, 11($1)
+; MIPS32-NEXT:    andi $2, $2, 7
+; MIPS32-NEXT:    not $6, $2
+; MIPS32-NEXT:    andi $6, $6, 31
+; MIPS32-NEXT:    srlv $7, $5, $2
+; MIPS32-NEXT:    sllv $4, $4, $6
+; MIPS32-NEXT:    srlv $3, $3, $2
+; MIPS32-NEXT:    lwl $6, 0($1)
+; MIPS32-NEXT:    lwr $6, 3($1)
+; MIPS32-NEXT:    sll $8, $6, 1
+; MIPS32-NEXT:    xori $9, $2, 31
+; MIPS32-NEXT:    sllv $8, $8, $9
+; MIPS32-NEXT:    or $3, $3, $8
+; MIPS32-NEXT:    or $4, $7, $4
+; MIPS32-NEXT:    lwl $7, 12($1)
+; MIPS32-NEXT:    lwr $7, 15($1)
+; MIPS32-NEXT:    srlv $1, $7, $2
+; MIPS32-NEXT:    sll $5, $5, 1
+; MIPS32-NEXT:    sllv $5, $5, $9
+; MIPS32-NEXT:    or $5, $1, $5
+; MIPS32-NEXT:    srav $2, $6, $2
 ; MIPS32-NEXT:    jr $ra
-; MIPS32-NEXT:    move $5, $1
+; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
 ; 32R2-LABEL: ashr_i128:
 ; 32R2:       # %bb.0: # %entry
-; 32R2-NEXT:    lw $9, 28($sp)
-; 32R2-NEXT:    srlv $1, $7, $9
-; 32R2-NEXT:    not $2, $9
-; 32R2-NEXT:    sll $3, $6, 1
-; 32R2-NEXT:    sllv $3, $3, $2
-; 32R2-NEXT:    addiu $8, $zero, 64
-; 32R2-NEXT:    or $1, $3, $1
-; 32R2-NEXT:    srlv $10, $6, $9
-; 32R2-NEXT:    subu $3, $8, $9
-; 32R2-NEXT:    sllv $11, $5, $3
-; 32R2-NEXT:    andi $12, $3, 32
-; 32R2-NEXT:    andi $13, $9, 32
-; 32R2-NEXT:    move $8, $11
-; 32R2-NEXT:    movn $8, $zero, $12
-; 32R2-NEXT:    movn $1, $10, $13
-; 32R2-NEXT:    addiu $14, $9, -64
-; 32R2-NEXT:    srlv $15, $5, $14
-; 32R2-NEXT:    sll $24, $4, 1
-; 32R2-NEXT:    not $25, $14
-; 32R2-NEXT:    sllv $25, $24, $25
-; 32R2-NEXT:    or $gp, $1, $8
-; 32R2-NEXT:    or $1, $25, $15
-; 32R2-NEXT:    srav $8, $4, $14
-; 32R2-NEXT:    andi $14, $14, 32
-; 32R2-NEXT:    movn $1, $8, $14
-; 32R2-NEXT:    sllv $15, $4, $3
-; 32R2-NEXT:    not $3, $3
-; 32R2-NEXT:    srl $25, $5, 1
-; 32R2-NEXT:    srlv $3, $25, $3
-; 32R2-NEXT:    sltiu $25, $9, 64
-; 32R2-NEXT:    movn $1, $gp, $25
-; 32R2-NEXT:    or $15, $15, $3
-; 32R2-NEXT:    srlv $3, $5, $9
-; 32R2-NEXT:    sllv $2, $24, $2
-; 32R2-NEXT:    or $5, $2, $3
-; 32R2-NEXT:    srav $24, $4, $9
-; 32R2-NEXT:    movn $5, $24, $13
-; 32R2-NEXT:    sra $2, $4, 31
-; 32R2-NEXT:    movz $1, $7, $9
-; 32R2-NEXT:    move $3, $2
-; 32R2-NEXT:    movn $3, $5, $25
-; 32R2-NEXT:    movn $15, $11, $12
-; 32R2-NEXT:    movn $10, $zero, $13
-; 32R2-NEXT:    or $4, $10, $15
-; 32R2-NEXT:    movn $8, $2, $14
-; 32R2-NEXT:    movn $8, $4, $25
-; 32R2-NEXT:    movz $8, $6, $9
-; 32R2-NEXT:    movn $24, $2, $13
-; 32R2-NEXT:    movn $2, $24, $25
-; 32R2-NEXT:    move $4, $8
+; 32R2-NEXT:    addiu $sp, $sp, -32
+; 32R2-NEXT:    .cfi_def_cfa_offset 32
+; 32R2-NEXT:    swl $7, 28($sp)
+; 32R2-NEXT:    swl $6, 24($sp)
+; 32R2-NEXT:    swl $5, 20($sp)
+; 32R2-NEXT:    sra $1, $4, 31
+; 32R2-NEXT:    swl $4, 16($sp)
+; 32R2-NEXT:    swl $1, 12($sp)
+; 32R2-NEXT:    swl $1, 8($sp)
+; 32R2-NEXT:    swl $1, 4($sp)
+; 32R2-NEXT:    swl $1, 0($sp)
+; 32R2-NEXT:    swr $7, 31($sp)
+; 32R2-NEXT:    swr $6, 27($sp)
+; 32R2-NEXT:    swr $5, 23($sp)
+; 32R2-NEXT:    swr $4, 19($sp)
+; 32R2-NEXT:    swr $1, 15($sp)
+; 32R2-NEXT:    swr $1, 11($sp)
+; 32R2-NEXT:    swr $1, 7($sp)
+; 32R2-NEXT:    swr $1, 3($sp)
+; 32R2-NEXT:    addiu $1, $sp, 0
+; 32R2-NEXT:    addiu $1, $1, 16
+; 32R2-NEXT:    lw $2, 60($sp)
+; 32R2-NEXT:    ext $3, $2, 3, 4
+; 32R2-NEXT:    subu $1, $1, $3
+; 32R2-NEXT:    lwl $3, 4($1)
+; 32R2-NEXT:    lwr $3, 7($1)
+; 32R2-NEXT:    sll $4, $3, 1
+; 32R2-NEXT:    lwl $5, 8($1)
+; 32R2-NEXT:    lwr $5, 11($1)
+; 32R2-NEXT:    andi $2, $2, 7
+; 32R2-NEXT:    not $6, $2
+; 32R2-NEXT:    andi $6, $6, 31
+; 32R2-NEXT:    srlv $7, $5, $2
+; 32R2-NEXT:    sllv $4, $4, $6
+; 32R2-NEXT:    srlv $3, $3, $2
+; 32R2-NEXT:    lwl $6, 0($1)
+; 32R2-NEXT:    lwr $6, 3($1)
+; 32R2-NEXT:    sll $8, $6, 1
+; 32R2-NEXT:    xori $9, $2, 31
+; 32R2-NEXT:    sllv $8, $8, $9
+; 32R2-NEXT:    or $3, $3, $8
+; 32R2-NEXT:    or $4, $7, $4
+; 32R2-NEXT:    lwl $7, 12($1)
+; 32R2-NEXT:    lwr $7, 15($1)
+; 32R2-NEXT:    srlv $1, $7, $2
+; 32R2-NEXT:    sll $5, $5, 1
+; 32R2-NEXT:    sllv $5, $5, $9
+; 32R2-NEXT:    or $5, $1, $5
+; 32R2-NEXT:    srav $2, $6, $2
 ; 32R2-NEXT:    jr $ra
-; 32R2-NEXT:    move $5, $1
+; 32R2-NEXT:    addiu $sp, $sp, 32
 ;
 ; 32R6-LABEL: ashr_i128:
 ; 32R6:       # %bb.0: # %entry
-; 32R6-NEXT:    lw $3, 28($sp)
-; 32R6-NEXT:    addiu $1, $zero, 64
+; 32R6-NEXT:    addiu $sp, $sp, -32
+; 32R6-NEXT:    .cfi_def_cfa_offset 32
+; 32R6-NEXT:    sra $1, $4, 31
+; 32R6-NEXT:    sw $7, 28($sp)
+; 32R6-NEXT:    sw $6, 24($sp)
+; 32R6-NEXT:    sw $5, 20($sp)
+; 32R6-NEXT:    sw $4, 16($sp)
+; 32R6-NEXT:    sw $1, 12($sp)
+; 32R6-NEXT:    sw $1, 8($sp)
+; 32R6-NEXT:    sw $1, 4($sp)
+; 32R6-NEXT:    sw $1, 0($sp)
+; 32R6-NEXT:    addiu $1, $sp, 0
+; 32R6-NEXT:    addiu $1, $1, 16
+; 32R6-NEXT:    lw $2, 60($sp)
+; 32R6-NEXT:    ext $3, $2, 3, 4
 ; 32R6-NEXT:    subu $1, $1, $3
-; 32R6-NEXT:    sllv $2, $5, $1
-; 32R6-NEXT:    andi $8, $1, 32
-; 32R6-NEXT:    selnez $9, $2, $8
-; 32R6-NEXT:    sllv $10, $4, $1
-; 32R6-NEXT:    not $1, $1
-; 32R6-NEXT:    srl $11, $5, 1
-; 32R6-NEXT:    srlv $1, $11, $1
-; 32R6-NEXT:    or $1, $10, $1
-; 32R6-NEXT:    seleqz $1, $1, $8
-; 32R6-NEXT:    or $1, $9, $1
-; 32R6-NEXT:    srlv $9, $7, $3
-; 32R6-NEXT:    not $10, $3
-; 32R6-NEXT:    sll $11, $6, 1
-; 32R6-NEXT:    sllv $11, $11, $10
-; 32R6-NEXT:    or $9, $11, $9
-; 32R6-NEXT:    andi $11, $3, 32
-; 32R6-NEXT:    seleqz $9, $9, $11
-; 32R6-NEXT:    srlv $12, $6, $3
-; 32R6-NEXT:    selnez $13, $12, $11
-; 32R6-NEXT:    seleqz $12, $12, $11
-; 32R6-NEXT:    or $1, $12, $1
-; 32R6-NEXT:    seleqz $2, $2, $8
-; 32R6-NEXT:    or $8, $13, $9
-; 32R6-NEXT:    addiu $9, $3, -64
-; 32R6-NEXT:    srlv $12, $5, $9
-; 32R6-NEXT:    sll $13, $4, 1
-; 32R6-NEXT:    not $14, $9
-; 32R6-NEXT:    sllv $14, $13, $14
-; 32R6-NEXT:    sltiu $15, $3, 64
-; 32R6-NEXT:    or $2, $8, $2
-; 32R6-NEXT:    selnez $1, $1, $15
-; 32R6-NEXT:    or $8, $14, $12
-; 32R6-NEXT:    srav $12, $4, $9
-; 32R6-NEXT:    andi $9, $9, 32
-; 32R6-NEXT:    seleqz $14, $12, $9
-; 32R6-NEXT:    sra $24, $4, 31
-; 32R6-NEXT:    selnez $25, $24, $9
-; 32R6-NEXT:    seleqz $8, $8, $9
-; 32R6-NEXT:    or $14, $25, $14
-; 32R6-NEXT:    seleqz $14, $14, $15
-; 32R6-NEXT:    selnez $9, $12, $9
-; 32R6-NEXT:    seleqz $12, $24, $15
-; 32R6-NEXT:    or $1, $1, $14
-; 32R6-NEXT:    selnez $14, $1, $3
-; 32R6-NEXT:    selnez $1, $2, $15
-; 32R6-NEXT:    or $2, $9, $8
-; 32R6-NEXT:    srav $8, $4, $3
-; 32R6-NEXT:    seleqz $4, $8, $11
-; 32R6-NEXT:    selnez $9, $24, $11
-; 32R6-NEXT:    or $4, $9, $4
-; 32R6-NEXT:    selnez $9, $4, $15
-; 32R6-NEXT:    seleqz $2, $2, $15
-; 32R6-NEXT:    seleqz $4, $6, $3
-; 32R6-NEXT:    seleqz $6, $7, $3
-; 32R6-NEXT:    or $1, $1, $2
-; 32R6-NEXT:    selnez $1, $1, $3
-; 32R6-NEXT:    or $1, $6, $1
-; 32R6-NEXT:    or $4, $4, $14
-; 32R6-NEXT:    or $2, $9, $12
-; 32R6-NEXT:    srlv $3, $5, $3
-; 32R6-NEXT:    sllv $5, $13, $10
-; 32R6-NEXT:    or $3, $5, $3
-; 32R6-NEXT:    seleqz $3, $3, $11
-; 32R6-NEXT:    selnez $5, $8, $11
-; 32R6-NEXT:    or $3, $5, $3
-; 32R6-NEXT:    selnez $3, $3, $15
-; 32R6-NEXT:    or $3, $3, $12
+; 32R6-NEXT:    lw $3, 4($1)
+; 32R6-NEXT:    sll $4, $3, 1
+; 32R6-NEXT:    lw $5, 8($1)
+; 32R6-NEXT:    andi $2, $2, 7
+; 32R6-NEXT:    not $6, $2
+; 32R6-NEXT:    andi $6, $6, 31
+; 32R6-NEXT:    srlv $7, $5, $2
+; 32R6-NEXT:    sllv $4, $4, $6
+; 32R6-NEXT:    srlv $3, $3, $2
+; 32R6-NEXT:    lw $6, 0($1)
+; 32R6-NEXT:    sll $8, $6, 1
+; 32R6-NEXT:    xori $9, $2, 31
+; 32R6-NEXT:    sllv $8, $8, $9
+; 32R6-NEXT:    or $3, $3, $8
+; 32R6-NEXT:    or $4, $7, $4
+; 32R6-NEXT:    lw $1, 12($1)
+; 32R6-NEXT:    srlv $1, $1, $2
+; 32R6-NEXT:    sll $5, $5, 1
+; 32R6-NEXT:    sllv $5, $5, $9
+; 32R6-NEXT:    or $5, $1, $5
+; 32R6-NEXT:    srav $2, $6, $2
 ; 32R6-NEXT:    jr $ra
-; 32R6-NEXT:    move $5, $1
+; 32R6-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS3-LABEL: ashr_i128:
 ; MIPS3:       # %bb.0: # %entry
@@ -760,175 +671,105 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ;
 ; MMR3-LABEL: ashr_i128:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    addiusp -48
-; MMR3-NEXT:    .cfi_def_cfa_offset 48
-; MMR3-NEXT:    swp $16, 40($sp)
+; MMR3-NEXT:    addiusp -40
+; MMR3-NEXT:    .cfi_def_cfa_offset 40
+; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    move $8, $7
-; MMR3-NEXT:    sw $6, 32($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    sw $5, 36($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    sw $4, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    lw $16, 76($sp)
-; MMR3-NEXT:    srlv $4, $7, $16
-; MMR3-NEXT:    not16 $3, $16
-; MMR3-NEXT:    sw $3, 24($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    sll16 $2, $6, 1
-; MMR3-NEXT:    sllv $3, $2, $3
-; MMR3-NEXT:    li16 $2, 64
-; MMR3-NEXT:    or16 $3, $4
-; MMR3-NEXT:    srlv $6, $6, $16
-; MMR3-NEXT:    sw $6, 12($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    subu16 $7, $2, $16
-; MMR3-NEXT:    sllv $9, $5, $7
-; MMR3-NEXT:    andi16 $2, $7, 32
-; MMR3-NEXT:    sw $2, 28($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    andi16 $5, $16, 32
-; MMR3-NEXT:    sw $5, 16($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    move $4, $9
-; MMR3-NEXT:    li16 $17, 0
-; MMR3-NEXT:    movn $4, $17, $2
-; MMR3-NEXT:    movn $3, $6, $5
-; MMR3-NEXT:    addiu $2, $16, -64
-; MMR3-NEXT:    lw $5, 36($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    srlv $5, $5, $2
-; MMR3-NEXT:    sw $5, 20($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    lw $17, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    sll16 $6, $17, 1
-; MMR3-NEXT:    sw $6, 4($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    not16 $5, $2
-; MMR3-NEXT:    sllv $5, $6, $5
-; MMR3-NEXT:    or16 $3, $4
-; MMR3-NEXT:    lw $4, 20($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    or16 $5, $4
-; MMR3-NEXT:    srav $1, $17, $2
-; MMR3-NEXT:    andi16 $2, $2, 32
-; MMR3-NEXT:    sw $2, 20($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    movn $5, $1, $2
-; MMR3-NEXT:    sllv $2, $17, $7
-; MMR3-NEXT:    not16 $4, $7
-; MMR3-NEXT:    lw $7, 36($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    srl16 $6, $7, 1
-; MMR3-NEXT:    srlv $6, $6, $4
-; MMR3-NEXT:    sltiu $10, $16, 64
-; MMR3-NEXT:    movn $5, $3, $10
-; MMR3-NEXT:    or16 $6, $2
-; MMR3-NEXT:    srlv $2, $7, $16
-; MMR3-NEXT:    lw $3, 24($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    lw $4, 4($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    sllv $3, $4, $3
-; MMR3-NEXT:    or16 $3, $2
-; MMR3-NEXT:    srav $11, $17, $16
-; MMR3-NEXT:    lw $4, 16($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $3, $11, $4
-; MMR3-NEXT:    sra $2, $17, 31
-; MMR3-NEXT:    movz $5, $8, $16
-; MMR3-NEXT:    move $8, $2
-; MMR3-NEXT:    movn $8, $3, $10
-; MMR3-NEXT:    lw $3, 28($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $6, $9, $3
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    lw $7, 12($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $7, $3, $4
-; MMR3-NEXT:    or16 $7, $6
-; MMR3-NEXT:    lw $3, 20($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $1, $2, $3
-; MMR3-NEXT:    movn $1, $7, $10
-; MMR3-NEXT:    lw $3, 32($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movz $1, $3, $16
-; MMR3-NEXT:    movn $11, $2, $4
-; MMR3-NEXT:    movn $2, $11, $10
-; MMR3-NEXT:    move $3, $8
-; MMR3-NEXT:    move $4, $1
-; MMR3-NEXT:    lwp $16, 40($sp)
-; MMR3-NEXT:    addiusp 48
+; MMR3-NEXT:    swl $7, 28($sp)
+; MMR3-NEXT:    swl $6, 24($sp)
+; MMR3-NEXT:    swl $5, 20($sp)
+; MMR3-NEXT:    sra $1, $4, 31
+; MMR3-NEXT:    swl $4, 16($sp)
+; MMR3-NEXT:    swl $1, 12($sp)
+; MMR3-NEXT:    swl $1, 8($sp)
+; MMR3-NEXT:    swl $1, 4($sp)
+; MMR3-NEXT:    swl $1, 0($sp)
+; MMR3-NEXT:    swr $7, 31($sp)
+; MMR3-NEXT:    swr $6, 27($sp)
+; MMR3-NEXT:    swr $5, 23($sp)
+; MMR3-NEXT:    swr $4, 19($sp)
+; MMR3-NEXT:    swr $1, 15($sp)
+; MMR3-NEXT:    swr $1, 11($sp)
+; MMR3-NEXT:    swr $1, 7($sp)
+; MMR3-NEXT:    swr $1, 3($sp)
+; MMR3-NEXT:    addiur1sp $2, 0
+; MMR3-NEXT:    addiur2 $2, $2, 16
+; MMR3-NEXT:    lw $3, 68($sp)
+; MMR3-NEXT:    ext $4, $3, 3, 4
+; MMR3-NEXT:    subu16 $2, $2, $4
+; MMR3-NEXT:    lwl $7, 4($2)
+; MMR3-NEXT:    lwr $7, 7($2)
+; MMR3-NEXT:    sll16 $4, $7, 1
+; MMR3-NEXT:    lwl $5, 8($2)
+; MMR3-NEXT:    lwr $5, 11($2)
+; MMR3-NEXT:    andi16 $6, $3, 7
+; MMR3-NEXT:    not16 $3, $6
+; MMR3-NEXT:    andi16 $3, $3, 31
+; MMR3-NEXT:    srlv $16, $5, $6
+; MMR3-NEXT:    sllv $4, $4, $3
+; MMR3-NEXT:    srlv $17, $7, $6
+; MMR3-NEXT:    lwl $7, 0($2)
+; MMR3-NEXT:    lwr $7, 3($2)
+; MMR3-NEXT:    sll16 $3, $7, 1
+; MMR3-NEXT:    xori $1, $6, 31
+; MMR3-NEXT:    sllv $3, $3, $1
+; MMR3-NEXT:    or16 $3, $17
+; MMR3-NEXT:    or16 $4, $16
+; MMR3-NEXT:    lwl $8, 12($2)
+; MMR3-NEXT:    lwr $8, 15($2)
+; MMR3-NEXT:    srlv $2, $8, $6
+; MMR3-NEXT:    sll16 $5, $5, 1
+; MMR3-NEXT:    sllv $5, $5, $1
+; MMR3-NEXT:    or16 $5, $2
+; MMR3-NEXT:    srav $2, $7, $6
+; MMR3-NEXT:    lwp $16, 32($sp)
+; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: ashr_i128:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    addiu $sp, $sp, -16
-; MMR6-NEXT:    .cfi_def_cfa_offset 16
-; MMR6-NEXT:    sw $17, 12($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    sw $16, 8($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    .cfi_offset 17, -4
-; MMR6-NEXT:    .cfi_offset 16, -8
-; MMR6-NEXT:    move $1, $7
-; MMR6-NEXT:    lw $3, 44($sp)
-; MMR6-NEXT:    li16 $2, 64
-; MMR6-NEXT:    subu16 $7, $2, $3
-; MMR6-NEXT:    sllv $8, $5, $7
-; MMR6-NEXT:    andi16 $2, $7, 32
-; MMR6-NEXT:    selnez $9, $8, $2
-; MMR6-NEXT:    sllv $10, $4, $7
-; MMR6-NEXT:    not16 $7, $7
-; MMR6-NEXT:    srl16 $16, $5, 1
-; MMR6-NEXT:    srlv $7, $16, $7
-; MMR6-NEXT:    or $7, $10, $7
-; MMR6-NEXT:    seleqz $7, $7, $2
-; MMR6-NEXT:    or $7, $9, $7
-; MMR6-NEXT:    srlv $9, $1, $3
-; MMR6-NEXT:    not16 $16, $3
-; MMR6-NEXT:    sw $16, 4($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    sll16 $17, $6, 1
-; MMR6-NEXT:    sllv $10, $17, $16
-; MMR6-NEXT:    or $9, $10, $9
-; MMR6-NEXT:    andi16 $17, $3, 32
-; MMR6-NEXT:    seleqz $9, $9, $17
-; MMR6-NEXT:    srlv $10, $6, $3
-; MMR6-NEXT:    selnez $11, $10, $17
-; MMR6-NEXT:    seleqz $10, $10, $17
-; MMR6-NEXT:    or $10, $10, $7
-; MMR6-NEXT:    seleqz $12, $8, $2
-; MMR6-NEXT:    or $8, $11, $9
-; MMR6-NEXT:    addiu $2, $3, -64
-; MMR6-NEXT:    srlv $9, $5, $2
-; MMR6-NEXT:    sll16 $7, $4, 1
-; MMR6-NEXT:    not16 $16, $2
-; MMR6-NEXT:    sllv $11, $7, $16
-; MMR6-NEXT:    sltiu $13, $3, 64
-; MMR6-NEXT:    or $8, $8, $12
-; MMR6-NEXT:    selnez $10, $10, $13
-; MMR6-NEXT:    or $9, $11, $9
-; MMR6-NEXT:    srav $11, $4, $2
-; MMR6-NEXT:    andi16 $2, $2, 32
-; MMR6-NEXT:    seleqz $12, $11, $2
-; MMR6-NEXT:    sra $14, $4, 31
-; MMR6-NEXT:    selnez $15, $14, $2
-; MMR6-NEXT:    seleqz $9, $9, $2
-; MMR6-NEXT:    or $12, $15, $12
-; MMR6-NEXT:    seleqz $12, $12, $13
-; MMR6-NEXT:    selnez $2, $11, $2
-; MMR6-NEXT:    seleqz $11, $14, $13
-; MMR6-NEXT:    or $10, $10, $12
-; MMR6-NEXT:    selnez $10, $10, $3
-; MMR6-NEXT:    selnez $8, $8, $13
-; MMR6-NEXT:    or $2, $2, $9
-; MMR6-NEXT:    srav $9, $4, $3
-; MMR6-NEXT:    seleqz $4, $9, $17
-; MMR6-NEXT:    selnez $12, $14, $17
-; MMR6-NEXT:    or $4, $12, $4
-; MMR6-NEXT:    selnez $12, $4, $13
-; MMR6-NEXT:    seleqz $2, $2, $13
-; MMR6-NEXT:    seleqz $4, $6, $3
-; MMR6-NEXT:    seleqz $1, $1, $3
-; MMR6-NEXT:    or $2, $8, $2
-; MMR6-NEXT:    selnez $2, $2, $3
-; MMR6-NEXT:    or $1, $1, $2
-; MMR6-NEXT:    or $4, $4, $10
-; MMR6-NEXT:    or $2, $12, $11
-; MMR6-NEXT:    srlv $3, $5, $3
-; MMR6-NEXT:    lw $5, 4($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    sllv $5, $7, $5
-; MMR6-NEXT:    or $3, $5, $3
-; MMR6-NEXT:    seleqz $3, $3, $17
-; MMR6-NEXT:    selnez $5, $9, $17
-; MMR6-NEXT:    or $3, $5, $3
-; MMR6-NEXT:    selnez $3, $3, $13
-; MMR6-NEXT:    or $3, $3, $11
-; MMR6-NEXT:    move $5, $1
-; MMR6-NEXT:    lw $16, 8($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    lw $17, 12($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    addiu $sp, $sp, 16
+; MMR6-NEXT:    addiu $sp, $sp, -40
+; MMR6-NEXT:    .cfi_def_cfa_offset 40
+; MMR6-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT:    .cfi_offset 16, -4
+; MMR6-NEXT:    sra $1, $4, 31
+; MMR6-NEXT:    sw $7, 32($sp)
+; MMR6-NEXT:    sw $6, 28($sp)
+; MMR6-NEXT:    sw $5, 24($sp)
+; MMR6-NEXT:    sw $4, 20($sp)
+; MMR6-NEXT:    sw $1, 16($sp)
+; MMR6-NEXT:    sw $1, 12($sp)
+; MMR6-NEXT:    sw $1, 8($sp)
+; MMR6-NEXT:    sw $1, 4($sp)
+; MMR6-NEXT:    addiu $2, $sp, 4
+; MMR6-NEXT:    addiur2 $2, $2, 16
+; MMR6-NEXT:    lw $3, 68($sp)
+; MMR6-NEXT:    ext $4, $3, 3, 4
+; MMR6-NEXT:    subu16 $5, $2, $4
+; MMR6-NEXT:    lw16 $4, 4($5)
+; MMR6-NEXT:    sll16 $6, $4, 1
+; MMR6-NEXT:    lw16 $7, 8($5)
+; MMR6-NEXT:    andi16 $2, $3, 7
+; MMR6-NEXT:    not16 $3, $2
+; MMR6-NEXT:    andi16 $3, $3, 31
+; MMR6-NEXT:    srlv $1, $7, $2
+; MMR6-NEXT:    sllv $6, $6, $3
+; MMR6-NEXT:    srlv $3, $4, $2
+; MMR6-NEXT:    lw16 $16, 0($5)
+; MMR6-NEXT:    sll16 $4, $16, 1
+; MMR6-NEXT:    xori $8, $2, 31
+; MMR6-NEXT:    sllv $4, $4, $8
+; MMR6-NEXT:    or $3, $3, $4
+; MMR6-NEXT:    or $4, $1, $6
+; MMR6-NEXT:    lw16 $5, 12($5)
+; MMR6-NEXT:    srlv $1, $5, $2
+; MMR6-NEXT:    sll16 $5, $7, 1
+; MMR6-NEXT:    sllv $5, $5, $8
+; MMR6-NEXT:    or $5, $1, $5
+; MMR6-NEXT:    srav $2, $16, $2
+; MMR6-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
+; MMR6-NEXT:    addiu $sp, $sp, 40
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = ashr i128 %a, %b
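
The updated ashr_i128 checks above all follow the same shape: the i128 plus a block of sign bytes is spilled into a 32-byte stack slot, a 16-byte window is picked out by the whole-byte part of the shift amount, and the leftover bits are handled with ordinary sub-word shifts. As a rough illustration only, here is a minimal C sketch of that idea for a little-endian host (the MIPS output above is big-endian, so it indexes downward from the slot's midpoint instead); the function and variable names are mine, not LLVM's:

#include <stdint.h>
#include <string.h>

/* Hedged sketch of the stack-slot style lowering the ashr_i128 checks above
   exercise, assuming a little-endian host. v[0] is the low 64 bits, v[1] the
   high 64 bits; amt is 0..127. */
void ashr_i128_via_stack(uint64_t v[2], unsigned amt) {
  unsigned char slot[32];
  memcpy(slot, v, 16);                        /* one half of the slot: the value   */
  int neg = (int64_t)v[1] < 0;
  memset(slot + 16, neg ? 0xFF : 0x00, 16);   /* other half: sign-byte padding     */

  uint64_t out[2];
  memcpy(out, slot + amt / 8, 16);            /* whole-byte part of the shift      */

  unsigned rem = amt % 8;                     /* leftover 0..7 bits                */
  if (rem) {
    uint64_t lo = out[0], hi = out[1];
    uint64_t fill = neg ? ~0ULL : 0ULL;       /* sign bits shifted into the top    */
    out[0] = (lo >> rem) | (hi << (64 - rem));
    out[1] = (hi >> rem) | (fill << (64 - rem));
  }
  memcpy(v, out, 16);
}

Compiled alongside a quick comparison against __int128 arithmetic on the same host it should agree bit-for-bit, but it is only meant to show the shape of the lowering, not to mirror the generated code instruction for instruction.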

diff  --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index 9acdf25857117..c4e05117d28e1 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -396,299 +396,205 @@ entry:
 define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2-LABEL: lshr_i128:
 ; MIPS2:       # %bb.0: # %entry
-; MIPS2-NEXT:    lw $2, 28($sp)
-; MIPS2-NEXT:    addiu $1, $zero, 64
-; MIPS2-NEXT:    subu $12, $1, $2
-; MIPS2-NEXT:    sllv $10, $5, $12
-; MIPS2-NEXT:    andi $15, $12, 32
-; MIPS2-NEXT:    andi $8, $2, 32
-; MIPS2-NEXT:    addiu $3, $zero, 0
-; MIPS2-NEXT:    bnez $15, $BB5_2
-; MIPS2-NEXT:    addiu $13, $zero, 0
-; MIPS2-NEXT:  # %bb.1: # %entry
-; MIPS2-NEXT:    move $13, $10
-; MIPS2-NEXT:  $BB5_2: # %entry
-; MIPS2-NEXT:    not $9, $2
-; MIPS2-NEXT:    bnez $8, $BB5_5
-; MIPS2-NEXT:    srlv $24, $6, $2
-; MIPS2-NEXT:  # %bb.3: # %entry
-; MIPS2-NEXT:    sll $1, $6, 1
-; MIPS2-NEXT:    srlv $11, $7, $2
-; MIPS2-NEXT:    sllv $1, $1, $9
-; MIPS2-NEXT:    or $14, $1, $11
-; MIPS2-NEXT:    bnez $15, $BB5_7
-; MIPS2-NEXT:    move $11, $24
-; MIPS2-NEXT:  # %bb.4: # %entry
-; MIPS2-NEXT:    b $BB5_6
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_5:
-; MIPS2-NEXT:    addiu $11, $zero, 0
-; MIPS2-NEXT:    bnez $15, $BB5_7
-; MIPS2-NEXT:    move $14, $24
-; MIPS2-NEXT:  $BB5_6: # %entry
-; MIPS2-NEXT:    sllv $1, $4, $12
-; MIPS2-NEXT:    not $10, $12
-; MIPS2-NEXT:    srl $12, $5, 1
-; MIPS2-NEXT:    srlv $10, $12, $10
-; MIPS2-NEXT:    or $10, $1, $10
-; MIPS2-NEXT:  $BB5_7: # %entry
-; MIPS2-NEXT:    addiu $15, $2, -64
-; MIPS2-NEXT:    sll $12, $4, 1
-; MIPS2-NEXT:    andi $1, $15, 32
-; MIPS2-NEXT:    bnez $1, $BB5_10
-; MIPS2-NEXT:    srlv $25, $4, $15
-; MIPS2-NEXT:  # %bb.8: # %entry
-; MIPS2-NEXT:    srlv $1, $5, $15
-; MIPS2-NEXT:    not $15, $15
-; MIPS2-NEXT:    sllv $15, $12, $15
-; MIPS2-NEXT:    or $24, $15, $1
-; MIPS2-NEXT:    move $15, $25
-; MIPS2-NEXT:    sltiu $25, $2, 64
-; MIPS2-NEXT:    beqz $25, $BB5_12
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  # %bb.9: # %entry
-; MIPS2-NEXT:    b $BB5_11
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_10:
-; MIPS2-NEXT:    move $24, $25
-; MIPS2-NEXT:    sltiu $25, $2, 64
-; MIPS2-NEXT:    beqz $25, $BB5_12
-; MIPS2-NEXT:    addiu $15, $zero, 0
-; MIPS2-NEXT:  $BB5_11:
-; MIPS2-NEXT:    or $24, $14, $13
-; MIPS2-NEXT:  $BB5_12: # %entry
-; MIPS2-NEXT:    sltiu $13, $2, 1
-; MIPS2-NEXT:    beqz $13, $BB5_19
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  # %bb.13: # %entry
-; MIPS2-NEXT:    bnez $25, $BB5_20
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_14: # %entry
-; MIPS2-NEXT:    bnez $13, $BB5_16
-; MIPS2-NEXT:    addiu $10, $zero, 63
-; MIPS2-NEXT:  $BB5_15: # %entry
-; MIPS2-NEXT:    move $6, $15
-; MIPS2-NEXT:  $BB5_16: # %entry
-; MIPS2-NEXT:    sltu $10, $10, $2
-; MIPS2-NEXT:    bnez $8, $BB5_22
-; MIPS2-NEXT:    srlv $11, $4, $2
-; MIPS2-NEXT:  # %bb.17: # %entry
-; MIPS2-NEXT:    srlv $1, $5, $2
-; MIPS2-NEXT:    sllv $2, $12, $9
-; MIPS2-NEXT:    or $4, $2, $1
-; MIPS2-NEXT:    move $5, $11
-; MIPS2-NEXT:    bnez $10, $BB5_24
-; MIPS2-NEXT:    addiu $2, $zero, 0
-; MIPS2-NEXT:  # %bb.18: # %entry
-; MIPS2-NEXT:    b $BB5_23
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_19: # %entry
-; MIPS2-NEXT:    beqz $25, $BB5_14
-; MIPS2-NEXT:    move $7, $24
-; MIPS2-NEXT:  $BB5_20:
-; MIPS2-NEXT:    or $15, $11, $10
-; MIPS2-NEXT:    bnez $13, $BB5_16
-; MIPS2-NEXT:    addiu $10, $zero, 63
-; MIPS2-NEXT:  # %bb.21:
-; MIPS2-NEXT:    b $BB5_15
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_22:
-; MIPS2-NEXT:    addiu $5, $zero, 0
-; MIPS2-NEXT:    move $4, $11
-; MIPS2-NEXT:    bnez $10, $BB5_24
-; MIPS2-NEXT:    addiu $2, $zero, 0
-; MIPS2-NEXT:  $BB5_23: # %entry
-; MIPS2-NEXT:    move $2, $5
-; MIPS2-NEXT:  $BB5_24: # %entry
-; MIPS2-NEXT:    bnez $10, $BB5_26
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  # %bb.25: # %entry
-; MIPS2-NEXT:    move $3, $4
-; MIPS2-NEXT:  $BB5_26: # %entry
-; MIPS2-NEXT:    move $4, $6
+; MIPS2-NEXT:    addiu $sp, $sp, -32
+; MIPS2-NEXT:    .cfi_def_cfa_offset 32
+; MIPS2-NEXT:    swl $7, 28($sp)
+; MIPS2-NEXT:    swl $6, 24($sp)
+; MIPS2-NEXT:    swl $5, 20($sp)
+; MIPS2-NEXT:    swl $4, 16($sp)
+; MIPS2-NEXT:    swl $zero, 12($sp)
+; MIPS2-NEXT:    swl $zero, 8($sp)
+; MIPS2-NEXT:    swl $zero, 4($sp)
+; MIPS2-NEXT:    swl $zero, 0($sp)
+; MIPS2-NEXT:    addiu $1, $sp, 0
+; MIPS2-NEXT:    swr $7, 31($sp)
+; MIPS2-NEXT:    swr $6, 27($sp)
+; MIPS2-NEXT:    swr $5, 23($sp)
+; MIPS2-NEXT:    swr $4, 19($sp)
+; MIPS2-NEXT:    swr $zero, 15($sp)
+; MIPS2-NEXT:    swr $zero, 11($sp)
+; MIPS2-NEXT:    swr $zero, 7($sp)
+; MIPS2-NEXT:    swr $zero, 3($sp)
+; MIPS2-NEXT:    addiu $1, $1, 16
+; MIPS2-NEXT:    lw $2, 60($sp)
+; MIPS2-NEXT:    srl $3, $2, 3
+; MIPS2-NEXT:    andi $3, $3, 15
+; MIPS2-NEXT:    subu $1, $1, $3
+; MIPS2-NEXT:    lwl $3, 4($1)
+; MIPS2-NEXT:    lwr $3, 7($1)
+; MIPS2-NEXT:    sll $4, $3, 1
+; MIPS2-NEXT:    lwl $5, 8($1)
+; MIPS2-NEXT:    lwr $5, 11($1)
+; MIPS2-NEXT:    andi $2, $2, 7
+; MIPS2-NEXT:    not $6, $2
+; MIPS2-NEXT:    andi $6, $6, 31
+; MIPS2-NEXT:    srlv $7, $5, $2
+; MIPS2-NEXT:    sllv $4, $4, $6
+; MIPS2-NEXT:    srlv $3, $3, $2
+; MIPS2-NEXT:    lwl $6, 0($1)
+; MIPS2-NEXT:    lwr $6, 3($1)
+; MIPS2-NEXT:    sll $8, $6, 1
+; MIPS2-NEXT:    xori $9, $2, 31
+; MIPS2-NEXT:    sllv $8, $8, $9
+; MIPS2-NEXT:    or $3, $3, $8
+; MIPS2-NEXT:    or $4, $7, $4
+; MIPS2-NEXT:    lwl $7, 12($1)
+; MIPS2-NEXT:    lwr $7, 15($1)
+; MIPS2-NEXT:    srlv $1, $7, $2
+; MIPS2-NEXT:    sll $5, $5, 1
+; MIPS2-NEXT:    sllv $5, $5, $9
+; MIPS2-NEXT:    or $5, $1, $5
+; MIPS2-NEXT:    srlv $2, $6, $2
 ; MIPS2-NEXT:    jr $ra
-; MIPS2-NEXT:    move $5, $7
+; MIPS2-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS32-LABEL: lshr_i128:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lw $9, 28($sp)
-; MIPS32-NEXT:    addiu $1, $zero, 64
-; MIPS32-NEXT:    subu $2, $1, $9
-; MIPS32-NEXT:    sllv $10, $5, $2
-; MIPS32-NEXT:    andi $11, $2, 32
-; MIPS32-NEXT:    move $1, $10
-; MIPS32-NEXT:    movn $1, $zero, $11
-; MIPS32-NEXT:    srlv $3, $7, $9
-; MIPS32-NEXT:    not $12, $9
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    swl $7, 28($sp)
+; MIPS32-NEXT:    swl $6, 24($sp)
+; MIPS32-NEXT:    swl $5, 20($sp)
+; MIPS32-NEXT:    swl $4, 16($sp)
+; MIPS32-NEXT:    swl $zero, 12($sp)
+; MIPS32-NEXT:    swl $zero, 8($sp)
+; MIPS32-NEXT:    swl $zero, 4($sp)
+; MIPS32-NEXT:    swl $zero, 0($sp)
+; MIPS32-NEXT:    addiu $1, $sp, 0
+; MIPS32-NEXT:    swr $7, 31($sp)
+; MIPS32-NEXT:    swr $6, 27($sp)
+; MIPS32-NEXT:    swr $5, 23($sp)
+; MIPS32-NEXT:    swr $4, 19($sp)
+; MIPS32-NEXT:    swr $zero, 15($sp)
+; MIPS32-NEXT:    swr $zero, 11($sp)
+; MIPS32-NEXT:    swr $zero, 7($sp)
+; MIPS32-NEXT:    swr $zero, 3($sp)
+; MIPS32-NEXT:    addiu $1, $1, 16
+; MIPS32-NEXT:    lw $2, 60($sp)
+; MIPS32-NEXT:    srl $3, $2, 3
+; MIPS32-NEXT:    andi $3, $3, 15
+; MIPS32-NEXT:    subu $1, $1, $3
+; MIPS32-NEXT:    lwl $3, 4($1)
+; MIPS32-NEXT:    lwr $3, 7($1)
+; MIPS32-NEXT:    sll $4, $3, 1
+; MIPS32-NEXT:    lwl $5, 8($1)
+; MIPS32-NEXT:    lwr $5, 11($1)
+; MIPS32-NEXT:    andi $2, $2, 7
+; MIPS32-NEXT:    not $6, $2
+; MIPS32-NEXT:    andi $6, $6, 31
+; MIPS32-NEXT:    srlv $7, $5, $2
+; MIPS32-NEXT:    sllv $4, $4, $6
+; MIPS32-NEXT:    srlv $3, $3, $2
+; MIPS32-NEXT:    lwl $6, 0($1)
+; MIPS32-NEXT:    lwr $6, 3($1)
 ; MIPS32-NEXT:    sll $8, $6, 1
-; MIPS32-NEXT:    sllv $8, $8, $12
-; MIPS32-NEXT:    or $3, $8, $3
-; MIPS32-NEXT:    srlv $13, $6, $9
-; MIPS32-NEXT:    andi $14, $9, 32
-; MIPS32-NEXT:    movn $3, $13, $14
-; MIPS32-NEXT:    addiu $15, $9, -64
-; MIPS32-NEXT:    or $3, $3, $1
-; MIPS32-NEXT:    srlv $1, $5, $15
-; MIPS32-NEXT:    sll $24, $4, 1
-; MIPS32-NEXT:    not $8, $15
-; MIPS32-NEXT:    sllv $8, $24, $8
-; MIPS32-NEXT:    or $1, $8, $1
-; MIPS32-NEXT:    srlv $8, $4, $15
-; MIPS32-NEXT:    andi $15, $15, 32
-; MIPS32-NEXT:    movn $1, $8, $15
-; MIPS32-NEXT:    sltiu $25, $9, 64
-; MIPS32-NEXT:    movn $1, $3, $25
-; MIPS32-NEXT:    sllv $3, $4, $2
-; MIPS32-NEXT:    not $2, $2
-; MIPS32-NEXT:    srl $gp, $5, 1
-; MIPS32-NEXT:    srlv $2, $gp, $2
-; MIPS32-NEXT:    or $gp, $3, $2
-; MIPS32-NEXT:    srlv $2, $5, $9
-; MIPS32-NEXT:    sllv $3, $24, $12
-; MIPS32-NEXT:    or $3, $3, $2
-; MIPS32-NEXT:    srlv $2, $4, $9
-; MIPS32-NEXT:    movn $3, $2, $14
-; MIPS32-NEXT:    movz $1, $7, $9
-; MIPS32-NEXT:    movz $3, $zero, $25
-; MIPS32-NEXT:    movn $gp, $10, $11
-; MIPS32-NEXT:    movn $13, $zero, $14
-; MIPS32-NEXT:    or $4, $13, $gp
-; MIPS32-NEXT:    movn $8, $zero, $15
-; MIPS32-NEXT:    movn $8, $4, $25
-; MIPS32-NEXT:    movz $8, $6, $9
-; MIPS32-NEXT:    movn $2, $zero, $14
-; MIPS32-NEXT:    movz $2, $zero, $25
-; MIPS32-NEXT:    move $4, $8
+; MIPS32-NEXT:    xori $9, $2, 31
+; MIPS32-NEXT:    sllv $8, $8, $9
+; MIPS32-NEXT:    or $3, $3, $8
+; MIPS32-NEXT:    or $4, $7, $4
+; MIPS32-NEXT:    lwl $7, 12($1)
+; MIPS32-NEXT:    lwr $7, 15($1)
+; MIPS32-NEXT:    srlv $1, $7, $2
+; MIPS32-NEXT:    sll $5, $5, 1
+; MIPS32-NEXT:    sllv $5, $5, $9
+; MIPS32-NEXT:    or $5, $1, $5
+; MIPS32-NEXT:    srlv $2, $6, $2
 ; MIPS32-NEXT:    jr $ra
-; MIPS32-NEXT:    move $5, $1
+; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS32R2-LABEL: lshr_i128:
 ; MIPS32R2:       # %bb.0: # %entry
-; MIPS32R2-NEXT:    lw $9, 28($sp)
-; MIPS32R2-NEXT:    addiu $1, $zero, 64
-; MIPS32R2-NEXT:    subu $2, $1, $9
-; MIPS32R2-NEXT:    sllv $10, $5, $2
-; MIPS32R2-NEXT:    andi $11, $2, 32
-; MIPS32R2-NEXT:    move $1, $10
-; MIPS32R2-NEXT:    movn $1, $zero, $11
-; MIPS32R2-NEXT:    srlv $3, $7, $9
-; MIPS32R2-NEXT:    not $12, $9
+; MIPS32R2-NEXT:    addiu $sp, $sp, -32
+; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32R2-NEXT:    swl $7, 28($sp)
+; MIPS32R2-NEXT:    swl $6, 24($sp)
+; MIPS32R2-NEXT:    swl $5, 20($sp)
+; MIPS32R2-NEXT:    swl $4, 16($sp)
+; MIPS32R2-NEXT:    swl $zero, 12($sp)
+; MIPS32R2-NEXT:    swl $zero, 8($sp)
+; MIPS32R2-NEXT:    swl $zero, 4($sp)
+; MIPS32R2-NEXT:    swl $zero, 0($sp)
+; MIPS32R2-NEXT:    swr $7, 31($sp)
+; MIPS32R2-NEXT:    swr $6, 27($sp)
+; MIPS32R2-NEXT:    swr $5, 23($sp)
+; MIPS32R2-NEXT:    swr $4, 19($sp)
+; MIPS32R2-NEXT:    swr $zero, 15($sp)
+; MIPS32R2-NEXT:    swr $zero, 11($sp)
+; MIPS32R2-NEXT:    swr $zero, 7($sp)
+; MIPS32R2-NEXT:    swr $zero, 3($sp)
+; MIPS32R2-NEXT:    addiu $1, $sp, 0
+; MIPS32R2-NEXT:    addiu $1, $1, 16
+; MIPS32R2-NEXT:    lw $2, 60($sp)
+; MIPS32R2-NEXT:    ext $3, $2, 3, 4
+; MIPS32R2-NEXT:    subu $1, $1, $3
+; MIPS32R2-NEXT:    lwl $3, 4($1)
+; MIPS32R2-NEXT:    lwr $3, 7($1)
+; MIPS32R2-NEXT:    sll $4, $3, 1
+; MIPS32R2-NEXT:    lwl $5, 8($1)
+; MIPS32R2-NEXT:    lwr $5, 11($1)
+; MIPS32R2-NEXT:    andi $2, $2, 7
+; MIPS32R2-NEXT:    not $6, $2
+; MIPS32R2-NEXT:    andi $6, $6, 31
+; MIPS32R2-NEXT:    srlv $7, $5, $2
+; MIPS32R2-NEXT:    sllv $4, $4, $6
+; MIPS32R2-NEXT:    srlv $3, $3, $2
+; MIPS32R2-NEXT:    lwl $6, 0($1)
+; MIPS32R2-NEXT:    lwr $6, 3($1)
 ; MIPS32R2-NEXT:    sll $8, $6, 1
-; MIPS32R2-NEXT:    sllv $8, $8, $12
-; MIPS32R2-NEXT:    or $3, $8, $3
-; MIPS32R2-NEXT:    srlv $13, $6, $9
-; MIPS32R2-NEXT:    andi $14, $9, 32
-; MIPS32R2-NEXT:    movn $3, $13, $14
-; MIPS32R2-NEXT:    addiu $15, $9, -64
-; MIPS32R2-NEXT:    or $3, $3, $1
-; MIPS32R2-NEXT:    srlv $1, $5, $15
-; MIPS32R2-NEXT:    sll $24, $4, 1
-; MIPS32R2-NEXT:    not $8, $15
-; MIPS32R2-NEXT:    sllv $8, $24, $8
-; MIPS32R2-NEXT:    or $1, $8, $1
-; MIPS32R2-NEXT:    srlv $8, $4, $15
-; MIPS32R2-NEXT:    andi $15, $15, 32
-; MIPS32R2-NEXT:    movn $1, $8, $15
-; MIPS32R2-NEXT:    sltiu $25, $9, 64
-; MIPS32R2-NEXT:    movn $1, $3, $25
-; MIPS32R2-NEXT:    sllv $3, $4, $2
-; MIPS32R2-NEXT:    not $2, $2
-; MIPS32R2-NEXT:    srl $gp, $5, 1
-; MIPS32R2-NEXT:    srlv $2, $gp, $2
-; MIPS32R2-NEXT:    or $gp, $3, $2
-; MIPS32R2-NEXT:    srlv $2, $5, $9
-; MIPS32R2-NEXT:    sllv $3, $24, $12
-; MIPS32R2-NEXT:    or $3, $3, $2
-; MIPS32R2-NEXT:    srlv $2, $4, $9
-; MIPS32R2-NEXT:    movn $3, $2, $14
-; MIPS32R2-NEXT:    movz $1, $7, $9
-; MIPS32R2-NEXT:    movz $3, $zero, $25
-; MIPS32R2-NEXT:    movn $gp, $10, $11
-; MIPS32R2-NEXT:    movn $13, $zero, $14
-; MIPS32R2-NEXT:    or $4, $13, $gp
-; MIPS32R2-NEXT:    movn $8, $zero, $15
-; MIPS32R2-NEXT:    movn $8, $4, $25
-; MIPS32R2-NEXT:    movz $8, $6, $9
-; MIPS32R2-NEXT:    movn $2, $zero, $14
-; MIPS32R2-NEXT:    movz $2, $zero, $25
-; MIPS32R2-NEXT:    move $4, $8
+; MIPS32R2-NEXT:    xori $9, $2, 31
+; MIPS32R2-NEXT:    sllv $8, $8, $9
+; MIPS32R2-NEXT:    or $3, $3, $8
+; MIPS32R2-NEXT:    or $4, $7, $4
+; MIPS32R2-NEXT:    lwl $7, 12($1)
+; MIPS32R2-NEXT:    lwr $7, 15($1)
+; MIPS32R2-NEXT:    srlv $1, $7, $2
+; MIPS32R2-NEXT:    sll $5, $5, 1
+; MIPS32R2-NEXT:    sllv $5, $5, $9
+; MIPS32R2-NEXT:    or $5, $1, $5
+; MIPS32R2-NEXT:    srlv $2, $6, $2
 ; MIPS32R2-NEXT:    jr $ra
-; MIPS32R2-NEXT:    move $5, $1
+; MIPS32R2-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS32R6-LABEL: lshr_i128:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    addiu $sp, $sp, -8
-; MIPS32R6-NEXT:    .cfi_def_cfa_offset 8
-; MIPS32R6-NEXT:    sw $16, 4($sp) # 4-byte Folded Spill
-; MIPS32R6-NEXT:    .cfi_offset 16, -4
-; MIPS32R6-NEXT:    lw $1, 36($sp)
-; MIPS32R6-NEXT:    srlv $2, $7, $1
-; MIPS32R6-NEXT:    not $3, $1
+; MIPS32R6-NEXT:    addiu $sp, $sp, -32
+; MIPS32R6-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32R6-NEXT:    addiu $1, $sp, 0
+; MIPS32R6-NEXT:    sw $7, 28($sp)
+; MIPS32R6-NEXT:    sw $6, 24($sp)
+; MIPS32R6-NEXT:    sw $5, 20($sp)
+; MIPS32R6-NEXT:    sw $4, 16($sp)
+; MIPS32R6-NEXT:    addiu $1, $1, 16
+; MIPS32R6-NEXT:    lw $2, 60($sp)
+; MIPS32R6-NEXT:    ext $3, $2, 3, 4
+; MIPS32R6-NEXT:    subu $1, $1, $3
+; MIPS32R6-NEXT:    sw $zero, 12($sp)
+; MIPS32R6-NEXT:    sw $zero, 8($sp)
+; MIPS32R6-NEXT:    sw $zero, 4($sp)
+; MIPS32R6-NEXT:    sw $zero, 0($sp)
+; MIPS32R6-NEXT:    lw $3, 4($1)
+; MIPS32R6-NEXT:    sll $4, $3, 1
+; MIPS32R6-NEXT:    lw $5, 8($1)
+; MIPS32R6-NEXT:    andi $2, $2, 7
+; MIPS32R6-NEXT:    not $6, $2
+; MIPS32R6-NEXT:    andi $6, $6, 31
+; MIPS32R6-NEXT:    srlv $7, $5, $2
+; MIPS32R6-NEXT:    sllv $4, $4, $6
+; MIPS32R6-NEXT:    srlv $3, $3, $2
+; MIPS32R6-NEXT:    lw $6, 0($1)
 ; MIPS32R6-NEXT:    sll $8, $6, 1
-; MIPS32R6-NEXT:    sllv $8, $8, $3
-; MIPS32R6-NEXT:    or $2, $8, $2
-; MIPS32R6-NEXT:    addiu $8, $1, -64
-; MIPS32R6-NEXT:    srlv $9, $5, $8
-; MIPS32R6-NEXT:    sll $10, $4, 1
-; MIPS32R6-NEXT:    not $11, $8
-; MIPS32R6-NEXT:    sllv $11, $10, $11
-; MIPS32R6-NEXT:    andi $12, $1, 32
-; MIPS32R6-NEXT:    seleqz $2, $2, $12
-; MIPS32R6-NEXT:    or $9, $11, $9
-; MIPS32R6-NEXT:    srlv $11, $6, $1
-; MIPS32R6-NEXT:    selnez $13, $11, $12
-; MIPS32R6-NEXT:    addiu $14, $zero, 64
-; MIPS32R6-NEXT:    subu $14, $14, $1
-; MIPS32R6-NEXT:    sllv $15, $5, $14
-; MIPS32R6-NEXT:    andi $24, $14, 32
-; MIPS32R6-NEXT:    andi $25, $8, 32
-; MIPS32R6-NEXT:    seleqz $9, $9, $25
-; MIPS32R6-NEXT:    seleqz $gp, $15, $24
-; MIPS32R6-NEXT:    or $2, $13, $2
-; MIPS32R6-NEXT:    selnez $13, $15, $24
-; MIPS32R6-NEXT:    sllv $15, $4, $14
-; MIPS32R6-NEXT:    not $14, $14
-; MIPS32R6-NEXT:    srl $16, $5, 1
-; MIPS32R6-NEXT:    srlv $14, $16, $14
-; MIPS32R6-NEXT:    or $14, $15, $14
-; MIPS32R6-NEXT:    seleqz $14, $14, $24
-; MIPS32R6-NEXT:    srlv $8, $4, $8
-; MIPS32R6-NEXT:    or $13, $13, $14
-; MIPS32R6-NEXT:    or $2, $2, $gp
-; MIPS32R6-NEXT:    srlv $5, $5, $1
-; MIPS32R6-NEXT:    selnez $14, $8, $25
-; MIPS32R6-NEXT:    sltiu $15, $1, 64
-; MIPS32R6-NEXT:    selnez $2, $2, $15
-; MIPS32R6-NEXT:    or $9, $14, $9
-; MIPS32R6-NEXT:    sllv $3, $10, $3
-; MIPS32R6-NEXT:    seleqz $10, $11, $12
-; MIPS32R6-NEXT:    or $10, $10, $13
-; MIPS32R6-NEXT:    or $3, $3, $5
-; MIPS32R6-NEXT:    seleqz $5, $9, $15
-; MIPS32R6-NEXT:    seleqz $9, $zero, $15
-; MIPS32R6-NEXT:    srlv $4, $4, $1
-; MIPS32R6-NEXT:    seleqz $11, $4, $12
-; MIPS32R6-NEXT:    selnez $11, $11, $15
-; MIPS32R6-NEXT:    seleqz $7, $7, $1
-; MIPS32R6-NEXT:    or $2, $2, $5
-; MIPS32R6-NEXT:    selnez $2, $2, $1
-; MIPS32R6-NEXT:    or $5, $7, $2
-; MIPS32R6-NEXT:    or $2, $9, $11
-; MIPS32R6-NEXT:    seleqz $3, $3, $12
-; MIPS32R6-NEXT:    selnez $7, $4, $12
-; MIPS32R6-NEXT:    seleqz $4, $6, $1
-; MIPS32R6-NEXT:    selnez $6, $10, $15
-; MIPS32R6-NEXT:    seleqz $8, $8, $25
-; MIPS32R6-NEXT:    seleqz $8, $8, $15
-; MIPS32R6-NEXT:    or $6, $6, $8
-; MIPS32R6-NEXT:    selnez $1, $6, $1
-; MIPS32R6-NEXT:    or $4, $4, $1
-; MIPS32R6-NEXT:    or $1, $7, $3
-; MIPS32R6-NEXT:    selnez $1, $1, $15
-; MIPS32R6-NEXT:    or $3, $9, $1
-; MIPS32R6-NEXT:    lw $16, 4($sp) # 4-byte Folded Reload
+; MIPS32R6-NEXT:    xori $9, $2, 31
+; MIPS32R6-NEXT:    sllv $8, $8, $9
+; MIPS32R6-NEXT:    or $3, $3, $8
+; MIPS32R6-NEXT:    or $4, $7, $4
+; MIPS32R6-NEXT:    lw $1, 12($1)
+; MIPS32R6-NEXT:    srlv $1, $1, $2
+; MIPS32R6-NEXT:    sll $5, $5, 1
+; MIPS32R6-NEXT:    sllv $5, $5, $9
+; MIPS32R6-NEXT:    or $5, $1, $5
+; MIPS32R6-NEXT:    srlv $2, $6, $2
 ; MIPS32R6-NEXT:    jr $ra
-; MIPS32R6-NEXT:    addiu $sp, $sp, 8
+; MIPS32R6-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS3-LABEL: lshr_i128:
 ; MIPS3:       # %bb.0: # %entry
@@ -775,177 +681,100 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    move $8, $7
-; MMR3-NEXT:    sw $6, 24($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    sw $4, 28($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    lw $16, 68($sp)
-; MMR3-NEXT:    li16 $2, 64
-; MMR3-NEXT:    subu16 $7, $2, $16
-; MMR3-NEXT:    sllv $9, $5, $7
-; MMR3-NEXT:    move $17, $5
-; MMR3-NEXT:    sw $5, 0($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    andi16 $3, $7, 32
-; MMR3-NEXT:    sw $3, 20($sp) # 4-byte Folded Spill
+; MMR3-NEXT:    swl $7, 28($sp)
+; MMR3-NEXT:    swl $6, 24($sp)
+; MMR3-NEXT:    swl $5, 20($sp)
 ; MMR3-NEXT:    li16 $2, 0
-; MMR3-NEXT:    move $4, $9
-; MMR3-NEXT:    movn $4, $2, $3
-; MMR3-NEXT:    srlv $5, $8, $16
-; MMR3-NEXT:    not16 $3, $16
-; MMR3-NEXT:    sw $3, 16($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    sll16 $2, $6, 1
-; MMR3-NEXT:    sllv $2, $2, $3
-; MMR3-NEXT:    or16 $2, $5
-; MMR3-NEXT:    srlv $5, $6, $16
-; MMR3-NEXT:    sw $5, 4($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    andi16 $3, $16, 32
-; MMR3-NEXT:    sw $3, 12($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    movn $2, $5, $3
-; MMR3-NEXT:    addiu $3, $16, -64
-; MMR3-NEXT:    or16 $2, $4
-; MMR3-NEXT:    srlv $4, $17, $3
-; MMR3-NEXT:    sw $4, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    lw $4, 28($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    sll16 $6, $4, 1
-; MMR3-NEXT:    not16 $5, $3
-; MMR3-NEXT:    sllv $5, $6, $5
-; MMR3-NEXT:    lw $17, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    or16 $5, $17
-; MMR3-NEXT:    srlv $1, $4, $3
-; MMR3-NEXT:    andi16 $3, $3, 32
-; MMR3-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    movn $5, $1, $3
-; MMR3-NEXT:    sltiu $10, $16, 64
-; MMR3-NEXT:    movn $5, $2, $10
-; MMR3-NEXT:    sllv $2, $4, $7
-; MMR3-NEXT:    not16 $3, $7
-; MMR3-NEXT:    lw $7, 0($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    srl16 $4, $7, 1
-; MMR3-NEXT:    srlv $4, $4, $3
-; MMR3-NEXT:    or16 $4, $2
-; MMR3-NEXT:    srlv $2, $7, $16
-; MMR3-NEXT:    lw $3, 16($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    sllv $3, $6, $3
-; MMR3-NEXT:    or16 $3, $2
-; MMR3-NEXT:    lw $2, 28($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    srlv $2, $2, $16
-; MMR3-NEXT:    lw $17, 12($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $3, $2, $17
-; MMR3-NEXT:    movz $5, $8, $16
-; MMR3-NEXT:    li16 $6, 0
-; MMR3-NEXT:    movz $3, $6, $10
-; MMR3-NEXT:    lw $7, 20($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $4, $9, $7
-; MMR3-NEXT:    lw $6, 4($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    li16 $7, 0
-; MMR3-NEXT:    movn $6, $7, $17
-; MMR3-NEXT:    or16 $6, $4
-; MMR3-NEXT:    lw $4, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $1, $7, $4
-; MMR3-NEXT:    movn $1, $6, $10
-; MMR3-NEXT:    lw $4, 24($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movz $1, $4, $16
-; MMR3-NEXT:    movn $2, $7, $17
-; MMR3-NEXT:    li16 $4, 0
-; MMR3-NEXT:    movz $2, $4, $10
-; MMR3-NEXT:    move $4, $1
+; MMR3-NEXT:    swl $4, 16($sp)
+; MMR3-NEXT:    swl $2, 12($sp)
+; MMR3-NEXT:    swl $2, 8($sp)
+; MMR3-NEXT:    swl $2, 4($sp)
+; MMR3-NEXT:    swl $2, 0($sp)
+; MMR3-NEXT:    swr $7, 31($sp)
+; MMR3-NEXT:    swr $6, 27($sp)
+; MMR3-NEXT:    swr $5, 23($sp)
+; MMR3-NEXT:    swr $4, 19($sp)
+; MMR3-NEXT:    swr $2, 15($sp)
+; MMR3-NEXT:    swr $2, 11($sp)
+; MMR3-NEXT:    swr $2, 7($sp)
+; MMR3-NEXT:    swr $2, 3($sp)
+; MMR3-NEXT:    addiur1sp $2, 0
+; MMR3-NEXT:    addiur2 $2, $2, 16
+; MMR3-NEXT:    lw $3, 68($sp)
+; MMR3-NEXT:    ext $4, $3, 3, 4
+; MMR3-NEXT:    subu16 $2, $2, $4
+; MMR3-NEXT:    lwl $7, 4($2)
+; MMR3-NEXT:    lwr $7, 7($2)
+; MMR3-NEXT:    sll16 $4, $7, 1
+; MMR3-NEXT:    lwl $5, 8($2)
+; MMR3-NEXT:    lwr $5, 11($2)
+; MMR3-NEXT:    andi16 $6, $3, 7
+; MMR3-NEXT:    not16 $3, $6
+; MMR3-NEXT:    andi16 $3, $3, 31
+; MMR3-NEXT:    srlv $16, $5, $6
+; MMR3-NEXT:    sllv $4, $4, $3
+; MMR3-NEXT:    srlv $17, $7, $6
+; MMR3-NEXT:    lwl $7, 0($2)
+; MMR3-NEXT:    lwr $7, 3($2)
+; MMR3-NEXT:    sll16 $3, $7, 1
+; MMR3-NEXT:    xori $1, $6, 31
+; MMR3-NEXT:    sllv $3, $3, $1
+; MMR3-NEXT:    or16 $3, $17
+; MMR3-NEXT:    or16 $4, $16
+; MMR3-NEXT:    lwl $8, 12($2)
+; MMR3-NEXT:    lwr $8, 15($2)
+; MMR3-NEXT:    srlv $2, $8, $6
+; MMR3-NEXT:    sll16 $5, $5, 1
+; MMR3-NEXT:    sllv $5, $5, $1
+; MMR3-NEXT:    or16 $5, $2
+; MMR3-NEXT:    srlv $2, $7, $6
 ; MMR3-NEXT:    lwp $16, 32($sp)
 ; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: lshr_i128:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    addiu $sp, $sp, -32
-; MMR6-NEXT:    .cfi_def_cfa_offset 32
-; MMR6-NEXT:    sw $17, 28($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    sw $16, 24($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    .cfi_offset 17, -4
-; MMR6-NEXT:    .cfi_offset 16, -8
-; MMR6-NEXT:    move $1, $7
-; MMR6-NEXT:    move $7, $5
-; MMR6-NEXT:    lw $3, 60($sp)
-; MMR6-NEXT:    srlv $2, $1, $3
-; MMR6-NEXT:    not16 $5, $3
-; MMR6-NEXT:    sw $5, 12($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    move $17, $6
-; MMR6-NEXT:    sw $6, 16($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    sll16 $6, $6, 1
-; MMR6-NEXT:    sllv $6, $6, $5
-; MMR6-NEXT:    or $8, $6, $2
-; MMR6-NEXT:    addiu $5, $3, -64
-; MMR6-NEXT:    srlv $9, $7, $5
-; MMR6-NEXT:    move $6, $4
-; MMR6-NEXT:    sll16 $2, $4, 1
-; MMR6-NEXT:    sw $2, 8($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    not16 $16, $5
-; MMR6-NEXT:    sllv $10, $2, $16
-; MMR6-NEXT:    andi16 $16, $3, 32
-; MMR6-NEXT:    seleqz $8, $8, $16
-; MMR6-NEXT:    or $9, $10, $9
-; MMR6-NEXT:    srlv $10, $17, $3
-; MMR6-NEXT:    selnez $11, $10, $16
-; MMR6-NEXT:    li16 $17, 64
-; MMR6-NEXT:    subu16 $2, $17, $3
-; MMR6-NEXT:    sllv $12, $7, $2
-; MMR6-NEXT:    move $17, $7
-; MMR6-NEXT:    andi16 $4, $2, 32
-; MMR6-NEXT:    andi16 $7, $5, 32
-; MMR6-NEXT:    sw $7, 20($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    seleqz $9, $9, $7
-; MMR6-NEXT:    seleqz $13, $12, $4
-; MMR6-NEXT:    or $8, $11, $8
-; MMR6-NEXT:    selnez $11, $12, $4
-; MMR6-NEXT:    sllv $12, $6, $2
-; MMR6-NEXT:    move $7, $6
-; MMR6-NEXT:    sw $6, 4($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    not16 $2, $2
-; MMR6-NEXT:    srl16 $6, $17, 1
-; MMR6-NEXT:    srlv $2, $6, $2
-; MMR6-NEXT:    or $2, $12, $2
-; MMR6-NEXT:    seleqz $2, $2, $4
-; MMR6-NEXT:    srlv $4, $7, $5
-; MMR6-NEXT:    or $11, $11, $2
-; MMR6-NEXT:    or $5, $8, $13
-; MMR6-NEXT:    srlv $6, $17, $3
-; MMR6-NEXT:    lw $2, 20($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    selnez $7, $4, $2
-; MMR6-NEXT:    sltiu $8, $3, 64
-; MMR6-NEXT:    selnez $12, $5, $8
-; MMR6-NEXT:    or $7, $7, $9
-; MMR6-NEXT:    lw $5, 12($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    lw $2, 8($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    sllv $9, $2, $5
-; MMR6-NEXT:    seleqz $10, $10, $16
-; MMR6-NEXT:    li16 $5, 0
-; MMR6-NEXT:    or $10, $10, $11
-; MMR6-NEXT:    or $6, $9, $6
-; MMR6-NEXT:    seleqz $2, $7, $8
-; MMR6-NEXT:    seleqz $7, $5, $8
-; MMR6-NEXT:    lw $5, 4($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    srlv $9, $5, $3
-; MMR6-NEXT:    seleqz $11, $9, $16
-; MMR6-NEXT:    selnez $11, $11, $8
-; MMR6-NEXT:    seleqz $1, $1, $3
-; MMR6-NEXT:    or $2, $12, $2
-; MMR6-NEXT:    selnez $2, $2, $3
-; MMR6-NEXT:    or $5, $1, $2
-; MMR6-NEXT:    or $2, $7, $11
-; MMR6-NEXT:    seleqz $1, $6, $16
-; MMR6-NEXT:    selnez $6, $9, $16
-; MMR6-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    seleqz $9, $16, $3
-; MMR6-NEXT:    selnez $10, $10, $8
-; MMR6-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    seleqz $4, $4, $16
-; MMR6-NEXT:    seleqz $4, $4, $8
-; MMR6-NEXT:    or $4, $10, $4
-; MMR6-NEXT:    selnez $3, $4, $3
-; MMR6-NEXT:    or $4, $9, $3
-; MMR6-NEXT:    or $1, $6, $1
-; MMR6-NEXT:    selnez $1, $1, $8
-; MMR6-NEXT:    or $3, $7, $1
-; MMR6-NEXT:    lw $16, 24($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    lw $17, 28($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    addiu $sp, $sp, 32
+; MMR6-NEXT:    addiu $sp, $sp, -40
+; MMR6-NEXT:    .cfi_def_cfa_offset 40
+; MMR6-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT:    .cfi_offset 16, -4
+; MMR6-NEXT:    li16 $2, 0
+; MMR6-NEXT:    sw $7, 32($sp)
+; MMR6-NEXT:    sw $6, 28($sp)
+; MMR6-NEXT:    sw $5, 24($sp)
+; MMR6-NEXT:    sw $4, 20($sp)
+; MMR6-NEXT:    sw $2, 16($sp)
+; MMR6-NEXT:    sw $2, 12($sp)
+; MMR6-NEXT:    sw $2, 8($sp)
+; MMR6-NEXT:    sw $2, 4($sp)
+; MMR6-NEXT:    addiu $2, $sp, 4
+; MMR6-NEXT:    addiur2 $2, $2, 16
+; MMR6-NEXT:    lw $3, 68($sp)
+; MMR6-NEXT:    ext $4, $3, 3, 4
+; MMR6-NEXT:    subu16 $5, $2, $4
+; MMR6-NEXT:    lw16 $4, 4($5)
+; MMR6-NEXT:    sll16 $6, $4, 1
+; MMR6-NEXT:    lw16 $7, 8($5)
+; MMR6-NEXT:    andi16 $2, $3, 7
+; MMR6-NEXT:    not16 $3, $2
+; MMR6-NEXT:    andi16 $3, $3, 31
+; MMR6-NEXT:    srlv $1, $7, $2
+; MMR6-NEXT:    sllv $6, $6, $3
+; MMR6-NEXT:    srlv $3, $4, $2
+; MMR6-NEXT:    lw16 $16, 0($5)
+; MMR6-NEXT:    sll16 $4, $16, 1
+; MMR6-NEXT:    xori $8, $2, 31
+; MMR6-NEXT:    sllv $4, $4, $8
+; MMR6-NEXT:    or $3, $3, $4
+; MMR6-NEXT:    or $4, $1, $6
+; MMR6-NEXT:    lw16 $5, 12($5)
+; MMR6-NEXT:    srlv $1, $5, $2
+; MMR6-NEXT:    sll16 $5, $7, 1
+; MMR6-NEXT:    sllv $5, $5, $8
+; MMR6-NEXT:    or $5, $1, $5
+; MMR6-NEXT:    srlv $2, $16, $2
+; MMR6-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
+; MMR6-NEXT:    addiu $sp, $sp, 40
 ; MMR6-NEXT:    jrc $ra
 entry:
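
One detail that recurs through all of the updated lshr_i128 checks is how the 0..127 shift amount is split once into a byte index and a bit remainder: `ext $r, $amt, 3, 4` (or `srl` plus `andi ..., 15` on pre-R2 cores) followed by `andi $r, $amt, 7`. A tiny illustrative sketch of that split, with names of my own choosing:

#include <stdint.h>

/* Illustrative decomposition of a 0..127 shift amount, mirroring the
   ext/andi (or srl+andi) pairs in the checks above; names are mine. */
void split_shift_amount(uint32_t amt, uint32_t *byte_index, uint32_t *bit_rem) {
  *byte_index = (amt >> 3) & 15; /* selects the 16-byte window of the 32-byte slot */
  *bit_rem    = amt & 7;         /* finished off with ordinary 32-bit shifts       */
}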
 

diff  --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index b47a1f8f1a4e9..77f9f0ed646ee 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -456,307 +456,201 @@ entry:
 define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2-LABEL: shl_i128:
 ; MIPS2:       # %bb.0: # %entry
-; MIPS2-NEXT:    addiu $sp, $sp, -8
-; MIPS2-NEXT:    .cfi_def_cfa_offset 8
-; MIPS2-NEXT:    sw $17, 4($sp) # 4-byte Folded Spill
-; MIPS2-NEXT:    sw $16, 0($sp) # 4-byte Folded Spill
-; MIPS2-NEXT:    .cfi_offset 17, -4
-; MIPS2-NEXT:    .cfi_offset 16, -8
-; MIPS2-NEXT:    lw $8, 36($sp)
-; MIPS2-NEXT:    addiu $1, $zero, 64
-; MIPS2-NEXT:    subu $3, $1, $8
-; MIPS2-NEXT:    srlv $9, $6, $3
-; MIPS2-NEXT:    andi $1, $3, 32
-; MIPS2-NEXT:    bnez $1, $BB5_2
-; MIPS2-NEXT:    addiu $2, $zero, 0
-; MIPS2-NEXT:  # %bb.1: # %entry
-; MIPS2-NEXT:    srlv $1, $7, $3
-; MIPS2-NEXT:    not $3, $3
-; MIPS2-NEXT:    sll $10, $6, 1
-; MIPS2-NEXT:    sllv $3, $10, $3
-; MIPS2-NEXT:    or $3, $3, $1
-; MIPS2-NEXT:    b $BB5_3
-; MIPS2-NEXT:    move $15, $9
-; MIPS2-NEXT:  $BB5_2:
-; MIPS2-NEXT:    addiu $15, $zero, 0
-; MIPS2-NEXT:    move $3, $9
-; MIPS2-NEXT:  $BB5_3: # %entry
-; MIPS2-NEXT:    not $13, $8
-; MIPS2-NEXT:    sllv $9, $5, $8
-; MIPS2-NEXT:    andi $10, $8, 32
-; MIPS2-NEXT:    bnez $10, $BB5_5
-; MIPS2-NEXT:    move $25, $9
-; MIPS2-NEXT:  # %bb.4: # %entry
-; MIPS2-NEXT:    sllv $1, $4, $8
-; MIPS2-NEXT:    srl $11, $5, 1
-; MIPS2-NEXT:    srlv $11, $11, $13
-; MIPS2-NEXT:    or $25, $1, $11
-; MIPS2-NEXT:  $BB5_5: # %entry
-; MIPS2-NEXT:    addiu $14, $8, -64
-; MIPS2-NEXT:    srl $24, $7, 1
-; MIPS2-NEXT:    sllv $11, $7, $14
-; MIPS2-NEXT:    andi $12, $14, 32
-; MIPS2-NEXT:    bnez $12, $BB5_7
-; MIPS2-NEXT:    move $gp, $11
-; MIPS2-NEXT:  # %bb.6: # %entry
-; MIPS2-NEXT:    sllv $1, $6, $14
-; MIPS2-NEXT:    not $14, $14
-; MIPS2-NEXT:    srlv $14, $24, $14
-; MIPS2-NEXT:    or $gp, $1, $14
-; MIPS2-NEXT:  $BB5_7: # %entry
-; MIPS2-NEXT:    sltiu $14, $8, 64
-; MIPS2-NEXT:    beqz $14, $BB5_9
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  # %bb.8:
-; MIPS2-NEXT:    or $gp, $25, $15
-; MIPS2-NEXT:  $BB5_9: # %entry
-; MIPS2-NEXT:    sllv $25, $7, $8
-; MIPS2-NEXT:    bnez $10, $BB5_11
-; MIPS2-NEXT:    addiu $17, $zero, 0
-; MIPS2-NEXT:  # %bb.10: # %entry
-; MIPS2-NEXT:    move $17, $25
-; MIPS2-NEXT:  $BB5_11: # %entry
-; MIPS2-NEXT:    addiu $1, $zero, 63
-; MIPS2-NEXT:    sltiu $15, $8, 1
-; MIPS2-NEXT:    beqz $15, $BB5_21
-; MIPS2-NEXT:    sltu $16, $1, $8
-; MIPS2-NEXT:  # %bb.12: # %entry
-; MIPS2-NEXT:    beqz $16, $BB5_22
-; MIPS2-NEXT:    addiu $7, $zero, 0
-; MIPS2-NEXT:  $BB5_13: # %entry
-; MIPS2-NEXT:    beqz $10, $BB5_23
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_14: # %entry
-; MIPS2-NEXT:    beqz $16, $BB5_24
-; MIPS2-NEXT:    addiu $6, $zero, 0
-; MIPS2-NEXT:  $BB5_15: # %entry
-; MIPS2-NEXT:    beqz $10, $BB5_25
-; MIPS2-NEXT:    addiu $8, $zero, 0
-; MIPS2-NEXT:  $BB5_16: # %entry
-; MIPS2-NEXT:    beqz $12, $BB5_26
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_17: # %entry
-; MIPS2-NEXT:    bnez $14, $BB5_27
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_18: # %entry
-; MIPS2-NEXT:    bnez $15, $BB5_20
-; MIPS2-NEXT:    nop
-; MIPS2-NEXT:  $BB5_19: # %entry
-; MIPS2-NEXT:    move $5, $2
-; MIPS2-NEXT:  $BB5_20: # %entry
-; MIPS2-NEXT:    move $2, $4
-; MIPS2-NEXT:    move $3, $5
-; MIPS2-NEXT:    move $4, $6
-; MIPS2-NEXT:    move $5, $7
-; MIPS2-NEXT:    lw $16, 0($sp) # 4-byte Folded Reload
-; MIPS2-NEXT:    lw $17, 4($sp) # 4-byte Folded Reload
+; MIPS2-NEXT:    addiu $sp, $sp, -32
+; MIPS2-NEXT:    .cfi_def_cfa_offset 32
+; MIPS2-NEXT:    swl $zero, 28($sp)
+; MIPS2-NEXT:    swl $zero, 24($sp)
+; MIPS2-NEXT:    swl $zero, 20($sp)
+; MIPS2-NEXT:    swl $zero, 16($sp)
+; MIPS2-NEXT:    swl $7, 12($sp)
+; MIPS2-NEXT:    swl $6, 8($sp)
+; MIPS2-NEXT:    swl $5, 4($sp)
+; MIPS2-NEXT:    swl $4, 0($sp)
+; MIPS2-NEXT:    swr $zero, 31($sp)
+; MIPS2-NEXT:    swr $zero, 27($sp)
+; MIPS2-NEXT:    swr $zero, 23($sp)
+; MIPS2-NEXT:    swr $zero, 19($sp)
+; MIPS2-NEXT:    swr $7, 15($sp)
+; MIPS2-NEXT:    swr $6, 11($sp)
+; MIPS2-NEXT:    swr $5, 7($sp)
+; MIPS2-NEXT:    swr $4, 3($sp)
+; MIPS2-NEXT:    lw $1, 60($sp)
+; MIPS2-NEXT:    srl $2, $1, 3
+; MIPS2-NEXT:    andi $2, $2, 15
+; MIPS2-NEXT:    addiu $3, $sp, 0
+; MIPS2-NEXT:    addu $4, $3, $2
+; MIPS2-NEXT:    lwl $5, 8($4)
+; MIPS2-NEXT:    lwr $5, 11($4)
+; MIPS2-NEXT:    srl $2, $5, 1
+; MIPS2-NEXT:    lwl $3, 4($4)
+; MIPS2-NEXT:    lwr $3, 7($4)
+; MIPS2-NEXT:    andi $1, $1, 7
+; MIPS2-NEXT:    not $6, $1
+; MIPS2-NEXT:    andi $6, $6, 31
+; MIPS2-NEXT:    sllv $7, $3, $1
+; MIPS2-NEXT:    srlv $6, $2, $6
+; MIPS2-NEXT:    lwl $2, 0($4)
+; MIPS2-NEXT:    lwr $2, 3($4)
+; MIPS2-NEXT:    sllv $2, $2, $1
+; MIPS2-NEXT:    srl $3, $3, 1
+; MIPS2-NEXT:    xori $8, $1, 31
+; MIPS2-NEXT:    srlv $3, $3, $8
+; MIPS2-NEXT:    or $2, $2, $3
+; MIPS2-NEXT:    or $3, $7, $6
+; MIPS2-NEXT:    sllv $5, $5, $1
+; MIPS2-NEXT:    lwl $6, 12($4)
+; MIPS2-NEXT:    lwr $6, 15($4)
+; MIPS2-NEXT:    srl $4, $6, 1
+; MIPS2-NEXT:    srlv $4, $4, $8
+; MIPS2-NEXT:    or $4, $5, $4
+; MIPS2-NEXT:    sllv $5, $6, $1
 ; MIPS2-NEXT:    jr $ra
-; MIPS2-NEXT:    addiu $sp, $sp, 8
-; MIPS2-NEXT:  $BB5_21: # %entry
-; MIPS2-NEXT:    move $4, $gp
-; MIPS2-NEXT:    bnez $16, $BB5_13
-; MIPS2-NEXT:    addiu $7, $zero, 0
-; MIPS2-NEXT:  $BB5_22: # %entry
-; MIPS2-NEXT:    bnez $10, $BB5_14
-; MIPS2-NEXT:    move $7, $17
-; MIPS2-NEXT:  $BB5_23: # %entry
-; MIPS2-NEXT:    sllv $1, $6, $8
-; MIPS2-NEXT:    srlv $6, $24, $13
-; MIPS2-NEXT:    or $25, $1, $6
-; MIPS2-NEXT:    bnez $16, $BB5_15
-; MIPS2-NEXT:    addiu $6, $zero, 0
-; MIPS2-NEXT:  $BB5_24: # %entry
-; MIPS2-NEXT:    move $6, $25
-; MIPS2-NEXT:    bnez $10, $BB5_16
-; MIPS2-NEXT:    addiu $8, $zero, 0
-; MIPS2-NEXT:  $BB5_25: # %entry
-; MIPS2-NEXT:    bnez $12, $BB5_17
-; MIPS2-NEXT:    move $8, $9
-; MIPS2-NEXT:  $BB5_26: # %entry
-; MIPS2-NEXT:    beqz $14, $BB5_18
-; MIPS2-NEXT:    move $2, $11
-; MIPS2-NEXT:  $BB5_27:
-; MIPS2-NEXT:    bnez $15, $BB5_20
-; MIPS2-NEXT:    or $2, $8, $3
-; MIPS2-NEXT:  # %bb.28:
-; MIPS2-NEXT:    b $BB5_19
-; MIPS2-NEXT:    nop
+; MIPS2-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS32-LABEL: shl_i128:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lw $8, 28($sp)
-; MIPS32-NEXT:    addiu $1, $zero, 64
-; MIPS32-NEXT:    subu $1, $1, $8
-; MIPS32-NEXT:    srlv $9, $6, $1
-; MIPS32-NEXT:    andi $10, $1, 32
-; MIPS32-NEXT:    move $2, $9
-; MIPS32-NEXT:    movn $2, $zero, $10
-; MIPS32-NEXT:    sllv $3, $4, $8
-; MIPS32-NEXT:    not $11, $8
-; MIPS32-NEXT:    srl $12, $5, 1
-; MIPS32-NEXT:    srlv $12, $12, $11
-; MIPS32-NEXT:    or $3, $3, $12
-; MIPS32-NEXT:    sllv $12, $5, $8
-; MIPS32-NEXT:    andi $13, $8, 32
-; MIPS32-NEXT:    movn $3, $12, $13
-; MIPS32-NEXT:    addiu $14, $8, -64
-; MIPS32-NEXT:    or $15, $3, $2
-; MIPS32-NEXT:    sllv $2, $6, $14
-; MIPS32-NEXT:    srl $24, $7, 1
-; MIPS32-NEXT:    not $3, $14
-; MIPS32-NEXT:    srlv $3, $24, $3
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    swl $zero, 28($sp)
+; MIPS32-NEXT:    swl $zero, 24($sp)
+; MIPS32-NEXT:    swl $zero, 20($sp)
+; MIPS32-NEXT:    swl $zero, 16($sp)
+; MIPS32-NEXT:    swl $7, 12($sp)
+; MIPS32-NEXT:    swl $6, 8($sp)
+; MIPS32-NEXT:    swl $5, 4($sp)
+; MIPS32-NEXT:    swl $4, 0($sp)
+; MIPS32-NEXT:    swr $zero, 31($sp)
+; MIPS32-NEXT:    swr $zero, 27($sp)
+; MIPS32-NEXT:    swr $zero, 23($sp)
+; MIPS32-NEXT:    swr $zero, 19($sp)
+; MIPS32-NEXT:    swr $7, 15($sp)
+; MIPS32-NEXT:    swr $6, 11($sp)
+; MIPS32-NEXT:    swr $5, 7($sp)
+; MIPS32-NEXT:    swr $4, 3($sp)
+; MIPS32-NEXT:    lw $1, 60($sp)
+; MIPS32-NEXT:    srl $2, $1, 3
+; MIPS32-NEXT:    andi $2, $2, 15
+; MIPS32-NEXT:    addiu $3, $sp, 0
+; MIPS32-NEXT:    addu $4, $3, $2
+; MIPS32-NEXT:    lwl $5, 8($4)
+; MIPS32-NEXT:    lwr $5, 11($4)
+; MIPS32-NEXT:    srl $2, $5, 1
+; MIPS32-NEXT:    lwl $3, 4($4)
+; MIPS32-NEXT:    lwr $3, 7($4)
+; MIPS32-NEXT:    andi $1, $1, 7
+; MIPS32-NEXT:    not $6, $1
+; MIPS32-NEXT:    andi $6, $6, 31
+; MIPS32-NEXT:    sllv $7, $3, $1
+; MIPS32-NEXT:    srlv $6, $2, $6
+; MIPS32-NEXT:    lwl $2, 0($4)
+; MIPS32-NEXT:    lwr $2, 3($4)
+; MIPS32-NEXT:    sllv $2, $2, $1
+; MIPS32-NEXT:    srl $3, $3, 1
+; MIPS32-NEXT:    xori $8, $1, 31
+; MIPS32-NEXT:    srlv $3, $3, $8
 ; MIPS32-NEXT:    or $2, $2, $3
-; MIPS32-NEXT:    sllv $3, $7, $14
-; MIPS32-NEXT:    andi $14, $14, 32
-; MIPS32-NEXT:    movn $2, $3, $14
-; MIPS32-NEXT:    sltiu $25, $8, 64
-; MIPS32-NEXT:    movn $2, $15, $25
-; MIPS32-NEXT:    srlv $15, $7, $1
-; MIPS32-NEXT:    not $1, $1
-; MIPS32-NEXT:    sll $gp, $6, 1
-; MIPS32-NEXT:    sllv $1, $gp, $1
-; MIPS32-NEXT:    or $15, $1, $15
-; MIPS32-NEXT:    sllv $1, $6, $8
-; MIPS32-NEXT:    srlv $6, $24, $11
-; MIPS32-NEXT:    or $1, $1, $6
-; MIPS32-NEXT:    sllv $6, $7, $8
-; MIPS32-NEXT:    movn $1, $6, $13
-; MIPS32-NEXT:    movz $2, $4, $8
-; MIPS32-NEXT:    movz $1, $zero, $25
-; MIPS32-NEXT:    movn $15, $9, $10
-; MIPS32-NEXT:    movn $12, $zero, $13
-; MIPS32-NEXT:    or $4, $12, $15
-; MIPS32-NEXT:    movn $3, $zero, $14
-; MIPS32-NEXT:    movn $3, $4, $25
-; MIPS32-NEXT:    movz $3, $5, $8
-; MIPS32-NEXT:    movn $6, $zero, $13
-; MIPS32-NEXT:    movz $6, $zero, $25
-; MIPS32-NEXT:    move $4, $1
+; MIPS32-NEXT:    or $3, $7, $6
+; MIPS32-NEXT:    sllv $5, $5, $1
+; MIPS32-NEXT:    lwl $6, 12($4)
+; MIPS32-NEXT:    lwr $6, 15($4)
+; MIPS32-NEXT:    srl $4, $6, 1
+; MIPS32-NEXT:    srlv $4, $4, $8
+; MIPS32-NEXT:    or $4, $5, $4
+; MIPS32-NEXT:    sllv $5, $6, $1
 ; MIPS32-NEXT:    jr $ra
-; MIPS32-NEXT:    move $5, $6
+; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS32R2-LABEL: shl_i128:
 ; MIPS32R2:       # %bb.0: # %entry
-; MIPS32R2-NEXT:    lw $8, 28($sp)
-; MIPS32R2-NEXT:    addiu $1, $zero, 64
-; MIPS32R2-NEXT:    subu $1, $1, $8
-; MIPS32R2-NEXT:    srlv $9, $6, $1
-; MIPS32R2-NEXT:    andi $10, $1, 32
-; MIPS32R2-NEXT:    move $2, $9
-; MIPS32R2-NEXT:    movn $2, $zero, $10
-; MIPS32R2-NEXT:    sllv $3, $4, $8
-; MIPS32R2-NEXT:    not $11, $8
-; MIPS32R2-NEXT:    srl $12, $5, 1
-; MIPS32R2-NEXT:    srlv $12, $12, $11
-; MIPS32R2-NEXT:    or $3, $3, $12
-; MIPS32R2-NEXT:    sllv $12, $5, $8
-; MIPS32R2-NEXT:    andi $13, $8, 32
-; MIPS32R2-NEXT:    movn $3, $12, $13
-; MIPS32R2-NEXT:    addiu $14, $8, -64
-; MIPS32R2-NEXT:    or $15, $3, $2
-; MIPS32R2-NEXT:    sllv $2, $6, $14
-; MIPS32R2-NEXT:    srl $24, $7, 1
-; MIPS32R2-NEXT:    not $3, $14
-; MIPS32R2-NEXT:    srlv $3, $24, $3
+; MIPS32R2-NEXT:    addiu $sp, $sp, -32
+; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32R2-NEXT:    swl $zero, 28($sp)
+; MIPS32R2-NEXT:    swl $zero, 24($sp)
+; MIPS32R2-NEXT:    swl $zero, 20($sp)
+; MIPS32R2-NEXT:    swl $zero, 16($sp)
+; MIPS32R2-NEXT:    swl $7, 12($sp)
+; MIPS32R2-NEXT:    swl $6, 8($sp)
+; MIPS32R2-NEXT:    swl $5, 4($sp)
+; MIPS32R2-NEXT:    swl $4, 0($sp)
+; MIPS32R2-NEXT:    swr $zero, 31($sp)
+; MIPS32R2-NEXT:    swr $zero, 27($sp)
+; MIPS32R2-NEXT:    swr $zero, 23($sp)
+; MIPS32R2-NEXT:    swr $zero, 19($sp)
+; MIPS32R2-NEXT:    swr $7, 15($sp)
+; MIPS32R2-NEXT:    swr $6, 11($sp)
+; MIPS32R2-NEXT:    swr $5, 7($sp)
+; MIPS32R2-NEXT:    swr $4, 3($sp)
+; MIPS32R2-NEXT:    lw $1, 60($sp)
+; MIPS32R2-NEXT:    ext $2, $1, 3, 4
+; MIPS32R2-NEXT:    addiu $3, $sp, 0
+; MIPS32R2-NEXT:    addu $4, $3, $2
+; MIPS32R2-NEXT:    lwl $5, 8($4)
+; MIPS32R2-NEXT:    lwr $5, 11($4)
+; MIPS32R2-NEXT:    srl $2, $5, 1
+; MIPS32R2-NEXT:    lwl $3, 4($4)
+; MIPS32R2-NEXT:    lwr $3, 7($4)
+; MIPS32R2-NEXT:    andi $1, $1, 7
+; MIPS32R2-NEXT:    not $6, $1
+; MIPS32R2-NEXT:    andi $6, $6, 31
+; MIPS32R2-NEXT:    sllv $7, $3, $1
+; MIPS32R2-NEXT:    srlv $6, $2, $6
+; MIPS32R2-NEXT:    lwl $2, 0($4)
+; MIPS32R2-NEXT:    lwr $2, 3($4)
+; MIPS32R2-NEXT:    sllv $2, $2, $1
+; MIPS32R2-NEXT:    srl $3, $3, 1
+; MIPS32R2-NEXT:    xori $8, $1, 31
+; MIPS32R2-NEXT:    srlv $3, $3, $8
 ; MIPS32R2-NEXT:    or $2, $2, $3
-; MIPS32R2-NEXT:    sllv $3, $7, $14
-; MIPS32R2-NEXT:    andi $14, $14, 32
-; MIPS32R2-NEXT:    movn $2, $3, $14
-; MIPS32R2-NEXT:    sltiu $25, $8, 64
-; MIPS32R2-NEXT:    movn $2, $15, $25
-; MIPS32R2-NEXT:    srlv $15, $7, $1
-; MIPS32R2-NEXT:    not $1, $1
-; MIPS32R2-NEXT:    sll $gp, $6, 1
-; MIPS32R2-NEXT:    sllv $1, $gp, $1
-; MIPS32R2-NEXT:    or $15, $1, $15
-; MIPS32R2-NEXT:    sllv $1, $6, $8
-; MIPS32R2-NEXT:    srlv $6, $24, $11
-; MIPS32R2-NEXT:    or $1, $1, $6
-; MIPS32R2-NEXT:    sllv $6, $7, $8
-; MIPS32R2-NEXT:    movn $1, $6, $13
-; MIPS32R2-NEXT:    movz $2, $4, $8
-; MIPS32R2-NEXT:    movz $1, $zero, $25
-; MIPS32R2-NEXT:    movn $15, $9, $10
-; MIPS32R2-NEXT:    movn $12, $zero, $13
-; MIPS32R2-NEXT:    or $4, $12, $15
-; MIPS32R2-NEXT:    movn $3, $zero, $14
-; MIPS32R2-NEXT:    movn $3, $4, $25
-; MIPS32R2-NEXT:    movz $3, $5, $8
-; MIPS32R2-NEXT:    movn $6, $zero, $13
-; MIPS32R2-NEXT:    movz $6, $zero, $25
-; MIPS32R2-NEXT:    move $4, $1
+; MIPS32R2-NEXT:    or $3, $7, $6
+; MIPS32R2-NEXT:    sllv $5, $5, $1
+; MIPS32R2-NEXT:    lwl $6, 12($4)
+; MIPS32R2-NEXT:    lwr $6, 15($4)
+; MIPS32R2-NEXT:    srl $4, $6, 1
+; MIPS32R2-NEXT:    srlv $4, $4, $8
+; MIPS32R2-NEXT:    or $4, $5, $4
+; MIPS32R2-NEXT:    sllv $5, $6, $1
 ; MIPS32R2-NEXT:    jr $ra
-; MIPS32R2-NEXT:    move $5, $6
+; MIPS32R2-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS32R6-LABEL: shl_i128:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lw $3, 28($sp)
-; MIPS32R6-NEXT:    sllv $1, $4, $3
-; MIPS32R6-NEXT:    not $2, $3
-; MIPS32R6-NEXT:    srl $8, $5, 1
-; MIPS32R6-NEXT:    srlv $8, $8, $2
-; MIPS32R6-NEXT:    or $1, $1, $8
-; MIPS32R6-NEXT:    sllv $8, $5, $3
-; MIPS32R6-NEXT:    andi $9, $3, 32
-; MIPS32R6-NEXT:    seleqz $1, $1, $9
-; MIPS32R6-NEXT:    selnez $10, $8, $9
-; MIPS32R6-NEXT:    addiu $11, $zero, 64
-; MIPS32R6-NEXT:    subu $11, $11, $3
-; MIPS32R6-NEXT:    srlv $12, $6, $11
-; MIPS32R6-NEXT:    andi $13, $11, 32
-; MIPS32R6-NEXT:    seleqz $14, $12, $13
-; MIPS32R6-NEXT:    or $1, $10, $1
-; MIPS32R6-NEXT:    selnez $10, $12, $13
-; MIPS32R6-NEXT:    srlv $12, $7, $11
-; MIPS32R6-NEXT:    not $11, $11
-; MIPS32R6-NEXT:    sll $15, $6, 1
-; MIPS32R6-NEXT:    sllv $11, $15, $11
-; MIPS32R6-NEXT:    or $11, $11, $12
-; MIPS32R6-NEXT:    seleqz $11, $11, $13
-; MIPS32R6-NEXT:    addiu $12, $3, -64
-; MIPS32R6-NEXT:    or $10, $10, $11
-; MIPS32R6-NEXT:    or $1, $1, $14
-; MIPS32R6-NEXT:    sllv $11, $6, $12
-; MIPS32R6-NEXT:    srl $13, $7, 1
-; MIPS32R6-NEXT:    not $14, $12
-; MIPS32R6-NEXT:    srlv $14, $13, $14
-; MIPS32R6-NEXT:    or $11, $11, $14
-; MIPS32R6-NEXT:    andi $14, $12, 32
-; MIPS32R6-NEXT:    seleqz $11, $11, $14
-; MIPS32R6-NEXT:    sllv $12, $7, $12
-; MIPS32R6-NEXT:    selnez $15, $12, $14
-; MIPS32R6-NEXT:    sltiu $24, $3, 64
-; MIPS32R6-NEXT:    selnez $1, $1, $24
-; MIPS32R6-NEXT:    or $11, $15, $11
-; MIPS32R6-NEXT:    sllv $6, $6, $3
-; MIPS32R6-NEXT:    srlv $2, $13, $2
-; MIPS32R6-NEXT:    seleqz $8, $8, $9
-; MIPS32R6-NEXT:    or $8, $8, $10
-; MIPS32R6-NEXT:    or $6, $6, $2
-; MIPS32R6-NEXT:    seleqz $2, $11, $24
-; MIPS32R6-NEXT:    seleqz $10, $zero, $24
-; MIPS32R6-NEXT:    sllv $7, $7, $3
-; MIPS32R6-NEXT:    seleqz $11, $7, $9
-; MIPS32R6-NEXT:    selnez $11, $11, $24
-; MIPS32R6-NEXT:    seleqz $4, $4, $3
-; MIPS32R6-NEXT:    or $1, $1, $2
-; MIPS32R6-NEXT:    selnez $1, $1, $3
-; MIPS32R6-NEXT:    or $2, $4, $1
-; MIPS32R6-NEXT:    or $1, $10, $11
-; MIPS32R6-NEXT:    seleqz $4, $6, $9
-; MIPS32R6-NEXT:    selnez $6, $7, $9
-; MIPS32R6-NEXT:    seleqz $5, $5, $3
-; MIPS32R6-NEXT:    selnez $7, $8, $24
-; MIPS32R6-NEXT:    seleqz $8, $12, $14
-; MIPS32R6-NEXT:    seleqz $8, $8, $24
-; MIPS32R6-NEXT:    or $7, $7, $8
-; MIPS32R6-NEXT:    selnez $3, $7, $3
-; MIPS32R6-NEXT:    or $3, $5, $3
-; MIPS32R6-NEXT:    or $4, $6, $4
-; MIPS32R6-NEXT:    selnez $4, $4, $24
-; MIPS32R6-NEXT:    or $4, $10, $4
+; MIPS32R6-NEXT:    addiu $sp, $sp, -32
+; MIPS32R6-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32R6-NEXT:    lw $1, 60($sp)
+; MIPS32R6-NEXT:    sw $7, 12($sp)
+; MIPS32R6-NEXT:    sw $6, 8($sp)
+; MIPS32R6-NEXT:    sw $5, 4($sp)
+; MIPS32R6-NEXT:    sw $4, 0($sp)
+; MIPS32R6-NEXT:    ext $2, $1, 3, 4
+; MIPS32R6-NEXT:    addiu $3, $sp, 0
+; MIPS32R6-NEXT:    addu $4, $3, $2
+; MIPS32R6-NEXT:    sw $zero, 28($sp)
+; MIPS32R6-NEXT:    sw $zero, 24($sp)
+; MIPS32R6-NEXT:    sw $zero, 20($sp)
+; MIPS32R6-NEXT:    sw $zero, 16($sp)
+; MIPS32R6-NEXT:    lw $5, 8($4)
+; MIPS32R6-NEXT:    srl $2, $5, 1
+; MIPS32R6-NEXT:    lw $3, 4($4)
+; MIPS32R6-NEXT:    andi $1, $1, 7
+; MIPS32R6-NEXT:    not $6, $1
+; MIPS32R6-NEXT:    andi $6, $6, 31
+; MIPS32R6-NEXT:    sllv $7, $3, $1
+; MIPS32R6-NEXT:    srlv $6, $2, $6
+; MIPS32R6-NEXT:    lw $2, 0($4)
+; MIPS32R6-NEXT:    sllv $2, $2, $1
+; MIPS32R6-NEXT:    srl $3, $3, 1
+; MIPS32R6-NEXT:    xori $8, $1, 31
+; MIPS32R6-NEXT:    srlv $3, $3, $8
+; MIPS32R6-NEXT:    or $2, $2, $3
+; MIPS32R6-NEXT:    or $3, $7, $6
+; MIPS32R6-NEXT:    sllv $5, $5, $1
+; MIPS32R6-NEXT:    lw $6, 12($4)
+; MIPS32R6-NEXT:    srl $4, $6, 1
+; MIPS32R6-NEXT:    srlv $4, $4, $8
+; MIPS32R6-NEXT:    or $4, $5, $4
+; MIPS32R6-NEXT:    sllv $5, $6, $1
 ; MIPS32R6-NEXT:    jr $ra
-; MIPS32R6-NEXT:    move $5, $1
+; MIPS32R6-NEXT:    addiu $sp, $sp, 32
 ;
 ; MIPS3-LABEL: shl_i128:
 ; MIPS3:       # %bb.0: # %entry
@@ -849,165 +743,95 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    move $17, $7
-; MMR3-NEXT:    sw $7, 4($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    move $7, $6
-; MMR3-NEXT:    move $1, $4
-; MMR3-NEXT:    lw $16, 68($sp)
-; MMR3-NEXT:    li16 $2, 64
-; MMR3-NEXT:    subu16 $6, $2, $16
-; MMR3-NEXT:    srlv $9, $7, $6
-; MMR3-NEXT:    andi16 $4, $6, 32
-; MMR3-NEXT:    sw $4, 24($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    move $2, $9
-; MMR3-NEXT:    movn $2, $3, $4
-; MMR3-NEXT:    sllv $3, $1, $16
-; MMR3-NEXT:    sw $3, 16($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    not16 $4, $16
-; MMR3-NEXT:    sw $4, 20($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    sw $5, 28($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    srl16 $3, $5, 1
-; MMR3-NEXT:    srlv $3, $3, $4
-; MMR3-NEXT:    lw $4, 16($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    or16 $3, $4
-; MMR3-NEXT:    sllv $5, $5, $16
-; MMR3-NEXT:    sw $5, 8($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    andi16 $4, $16, 32
-; MMR3-NEXT:    sw $4, 16($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    movn $3, $5, $4
-; MMR3-NEXT:    addiu $4, $16, -64
-; MMR3-NEXT:    or16 $3, $2
-; MMR3-NEXT:    sllv $2, $7, $4
-; MMR3-NEXT:    sw $2, 12($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    srl16 $5, $17, 1
-; MMR3-NEXT:    not16 $2, $4
-; MMR3-NEXT:    srlv $2, $5, $2
-; MMR3-NEXT:    lw $17, 12($sp) # 4-byte Folded Reload
+; MMR3-NEXT:    li16 $2, 0
+; MMR3-NEXT:    swl $2, 28($sp)
+; MMR3-NEXT:    swl $2, 24($sp)
+; MMR3-NEXT:    swl $2, 20($sp)
+; MMR3-NEXT:    swl $2, 16($sp)
+; MMR3-NEXT:    swl $7, 12($sp)
+; MMR3-NEXT:    swl $6, 8($sp)
+; MMR3-NEXT:    swl $5, 4($sp)
+; MMR3-NEXT:    swl $4, 0($sp)
+; MMR3-NEXT:    swr $2, 31($sp)
+; MMR3-NEXT:    swr $2, 27($sp)
+; MMR3-NEXT:    swr $2, 23($sp)
+; MMR3-NEXT:    swr $2, 19($sp)
+; MMR3-NEXT:    swr $7, 15($sp)
+; MMR3-NEXT:    swr $6, 11($sp)
+; MMR3-NEXT:    swr $5, 7($sp)
+; MMR3-NEXT:    swr $4, 3($sp)
+; MMR3-NEXT:    lw $2, 68($sp)
+; MMR3-NEXT:    ext $3, $2, 3, 4
+; MMR3-NEXT:    addiur1sp $4, 0
+; MMR3-NEXT:    addu16 $4, $4, $3
+; MMR3-NEXT:    lwl $6, 8($4)
+; MMR3-NEXT:    lwr $6, 11($4)
+; MMR3-NEXT:    srl16 $3, $6, 1
+; MMR3-NEXT:    lwl $7, 4($4)
+; MMR3-NEXT:    lwr $7, 7($4)
+; MMR3-NEXT:    andi16 $5, $2, 7
+; MMR3-NEXT:    not16 $2, $5
+; MMR3-NEXT:    andi16 $2, $2, 31
+; MMR3-NEXT:    sllv $16, $7, $5
+; MMR3-NEXT:    srlv $3, $3, $2
+; MMR3-NEXT:    lwl $1, 0($4)
+; MMR3-NEXT:    lwr $1, 3($4)
+; MMR3-NEXT:    sllv $17, $1, $5
+; MMR3-NEXT:    srl16 $2, $7, 1
+; MMR3-NEXT:    xori $1, $5, 31
+; MMR3-NEXT:    srlv $2, $2, $1
 ; MMR3-NEXT:    or16 $2, $17
-; MMR3-NEXT:    lw $17, 4($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    sllv $8, $17, $4
-; MMR3-NEXT:    andi16 $4, $4, 32
-; MMR3-NEXT:    sw $4, 12($sp) # 4-byte Folded Spill
-; MMR3-NEXT:    movn $2, $8, $4
-; MMR3-NEXT:    sltiu $10, $16, 64
-; MMR3-NEXT:    movn $2, $3, $10
-; MMR3-NEXT:    srlv $4, $17, $6
-; MMR3-NEXT:    not16 $3, $6
-; MMR3-NEXT:    sll16 $6, $7, 1
-; MMR3-NEXT:    sllv $3, $6, $3
-; MMR3-NEXT:    or16 $3, $4
-; MMR3-NEXT:    sllv $6, $7, $16
-; MMR3-NEXT:    lw $4, 20($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    srlv $4, $5, $4
+; MMR3-NEXT:    or16 $3, $16
+; MMR3-NEXT:    sllv $6, $6, $5
+; MMR3-NEXT:    lwl $7, 12($4)
+; MMR3-NEXT:    lwr $7, 15($4)
+; MMR3-NEXT:    srl16 $4, $7, 1
+; MMR3-NEXT:    srlv $4, $4, $1
 ; MMR3-NEXT:    or16 $4, $6
-; MMR3-NEXT:    sllv $6, $17, $16
-; MMR3-NEXT:    lw $17, 16($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $4, $6, $17
-; MMR3-NEXT:    movz $2, $1, $16
-; MMR3-NEXT:    li16 $5, 0
-; MMR3-NEXT:    movz $4, $5, $10
-; MMR3-NEXT:    lw $7, 24($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $3, $9, $7
-; MMR3-NEXT:    lw $5, 8($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    li16 $7, 0
-; MMR3-NEXT:    movn $5, $7, $17
-; MMR3-NEXT:    or16 $5, $3
-; MMR3-NEXT:    lw $3, 12($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movn $8, $7, $3
-; MMR3-NEXT:    movn $8, $5, $10
-; MMR3-NEXT:    lw $3, 28($sp) # 4-byte Folded Reload
-; MMR3-NEXT:    movz $8, $3, $16
-; MMR3-NEXT:    movn $6, $7, $17
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    movz $6, $3, $10
-; MMR3-NEXT:    move $3, $8
-; MMR3-NEXT:    move $5, $6
+; MMR3-NEXT:    sllv $5, $7, $5
 ; MMR3-NEXT:    lwp $16, 32($sp)
 ; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: shl_i128:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    addiu $sp, $sp, -16
-; MMR6-NEXT:    .cfi_def_cfa_offset 16
-; MMR6-NEXT:    sw $17, 12($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    sw $16, 8($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    .cfi_offset 17, -4
-; MMR6-NEXT:    .cfi_offset 16, -8
-; MMR6-NEXT:    move $11, $4
-; MMR6-NEXT:    lw $3, 44($sp)
-; MMR6-NEXT:    sllv $1, $4, $3
-; MMR6-NEXT:    not16 $2, $3
-; MMR6-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
-; MMR6-NEXT:    srl16 $16, $5, 1
-; MMR6-NEXT:    srlv $8, $16, $2
-; MMR6-NEXT:    or $1, $1, $8
-; MMR6-NEXT:    sllv $8, $5, $3
-; MMR6-NEXT:    andi16 $16, $3, 32
-; MMR6-NEXT:    seleqz $1, $1, $16
-; MMR6-NEXT:    selnez $9, $8, $16
-; MMR6-NEXT:    li16 $17, 64
-; MMR6-NEXT:    subu16 $17, $17, $3
-; MMR6-NEXT:    srlv $10, $6, $17
-; MMR6-NEXT:    andi16 $2, $17, 32
-; MMR6-NEXT:    seleqz $12, $10, $2
-; MMR6-NEXT:    or $1, $9, $1
-; MMR6-NEXT:    selnez $9, $10, $2
-; MMR6-NEXT:    srlv $10, $7, $17
-; MMR6-NEXT:    not16 $17, $17
-; MMR6-NEXT:    sll16 $4, $6, 1
-; MMR6-NEXT:    sllv $4, $4, $17
-; MMR6-NEXT:    or $4, $4, $10
-; MMR6-NEXT:    seleqz $2, $4, $2
-; MMR6-NEXT:    addiu $4, $3, -64
-; MMR6-NEXT:    or $10, $9, $2
-; MMR6-NEXT:    or $1, $1, $12
-; MMR6-NEXT:    sllv $9, $6, $4
-; MMR6-NEXT:    srl16 $2, $7, 1
-; MMR6-NEXT:    not16 $17, $4
-; MMR6-NEXT:    srlv $12, $2, $17
-; MMR6-NEXT:    or $9, $9, $12
-; MMR6-NEXT:    andi16 $17, $4, 32
-; MMR6-NEXT:    seleqz $9, $9, $17
-; MMR6-NEXT:    sllv $14, $7, $4
-; MMR6-NEXT:    selnez $12, $14, $17
-; MMR6-NEXT:    sltiu $13, $3, 64
-; MMR6-NEXT:    selnez $1, $1, $13
-; MMR6-NEXT:    or $9, $12, $9
-; MMR6-NEXT:    sllv $6, $6, $3
-; MMR6-NEXT:    lw $4, 4($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    srlv $2, $2, $4
-; MMR6-NEXT:    seleqz $8, $8, $16
-; MMR6-NEXT:    li16 $4, 0
-; MMR6-NEXT:    or $8, $8, $10
-; MMR6-NEXT:    or $6, $6, $2
-; MMR6-NEXT:    seleqz $2, $9, $13
-; MMR6-NEXT:    seleqz $9, $4, $13
-; MMR6-NEXT:    sllv $7, $7, $3
-; MMR6-NEXT:    seleqz $10, $7, $16
-; MMR6-NEXT:    selnez $10, $10, $13
-; MMR6-NEXT:    seleqz $11, $11, $3
-; MMR6-NEXT:    or $1, $1, $2
-; MMR6-NEXT:    selnez $1, $1, $3
-; MMR6-NEXT:    or $2, $11, $1
-; MMR6-NEXT:    or $1, $9, $10
-; MMR6-NEXT:    seleqz $6, $6, $16
-; MMR6-NEXT:    selnez $7, $7, $16
-; MMR6-NEXT:    seleqz $5, $5, $3
-; MMR6-NEXT:    selnez $8, $8, $13
-; MMR6-NEXT:    seleqz $4, $14, $17
-; MMR6-NEXT:    seleqz $4, $4, $13
-; MMR6-NEXT:    or $4, $8, $4
-; MMR6-NEXT:    selnez $3, $4, $3
-; MMR6-NEXT:    or $3, $5, $3
-; MMR6-NEXT:    or $4, $7, $6
-; MMR6-NEXT:    selnez $4, $4, $13
-; MMR6-NEXT:    or $4, $9, $4
-; MMR6-NEXT:    move $5, $1
-; MMR6-NEXT:    lw $16, 8($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    lw $17, 12($sp) # 4-byte Folded Reload
-; MMR6-NEXT:    addiu $sp, $sp, 16
+; MMR6-NEXT:    addiu $sp, $sp, -32
+; MMR6-NEXT:    .cfi_def_cfa_offset 32
+; MMR6-NEXT:    li16 $2, 0
+; MMR6-NEXT:    sw $2, 28($sp)
+; MMR6-NEXT:    sw $2, 24($sp)
+; MMR6-NEXT:    sw $2, 20($sp)
+; MMR6-NEXT:    sw $2, 16($sp)
+; MMR6-NEXT:    sw $7, 12($sp)
+; MMR6-NEXT:    sw $6, 8($sp)
+; MMR6-NEXT:    sw $5, 4($sp)
+; MMR6-NEXT:    sw $4, 0($sp)
+; MMR6-NEXT:    lw $2, 60($sp)
+; MMR6-NEXT:    ext $3, $2, 3, 4
+; MMR6-NEXT:    addiu $4, $sp, 0
+; MMR6-NEXT:    addu16 $4, $4, $3
+; MMR6-NEXT:    lw16 $6, 8($4)
+; MMR6-NEXT:    srl16 $3, $6, 1
+; MMR6-NEXT:    lw16 $7, 4($4)
+; MMR6-NEXT:    andi16 $5, $2, 7
+; MMR6-NEXT:    not16 $2, $5
+; MMR6-NEXT:    andi16 $2, $2, 31
+; MMR6-NEXT:    sllv $1, $7, $5
+; MMR6-NEXT:    srlv $3, $3, $2
+; MMR6-NEXT:    lw16 $2, 0($4)
+; MMR6-NEXT:    sllv $2, $2, $5
+; MMR6-NEXT:    srl16 $7, $7, 1
+; MMR6-NEXT:    xori $8, $5, 31
+; MMR6-NEXT:    srlv $7, $7, $8
+; MMR6-NEXT:    or $2, $2, $7
+; MMR6-NEXT:    or $3, $1, $3
+; MMR6-NEXT:    sllv $1, $6, $5
+; MMR6-NEXT:    lw16 $6, 12($4)
+; MMR6-NEXT:    srl16 $4, $6, 1
+; MMR6-NEXT:    srlv $4, $4, $8
+; MMR6-NEXT:    or $4, $1, $4
+; MMR6-NEXT:    sllv $5, $6, $5
+; MMR6-NEXT:    addiu $sp, $sp, 32
 ; MMR6-NEXT:    jrc $ra
 entry:
 

diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index b8d60536c86c6..c48361e0a8035 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -7,97 +7,60 @@ target triple = "powerpc-ellcc-linux"
 define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stwu 1, -48(1)
-; CHECK-NEXT:    stw 24, 16(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 2048
-; CHECK-NEXT:    stw 25, 20(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 26, 24(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 27, 28(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 28, 32(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 29, 36(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 30, 40(1) # 4-byte Folded Spill
-; CHECK-NEXT:    mtctr 6
+; CHECK-NEXT:    stwu 1, -64(1)
+; CHECK-NEXT:    stw 28, 48(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 8, 2048
+; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
 ; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 7, 7
+; CHECK-NEXT:    mtctr 8
+; CHECK-NEXT:    addi 8, 1, 16
 ; CHECK-NEXT:  .LBB0_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 9, 12(5)
-; CHECK-NEXT:    lwz 10, 8(4)
-; CHECK-NEXT:    lwz 11, 12(4)
-; CHECK-NEXT:    subfic 12, 9, 96
-; CHECK-NEXT:    lwz 7, 4(4)
-; CHECK-NEXT:    addi 0, 9, -64
-; CHECK-NEXT:    lwz 8, 0(4)
-; CHECK-NEXT:    subfic 28, 9, 32
-; CHECK-NEXT:    cmplwi 9, 64
-; CHECK-NEXT:    slw 26, 11, 9
-; CHECK-NEXT:    srw 12, 11, 12
-; CHECK-NEXT:    slw 25, 10, 0
-; CHECK-NEXT:    addi 30, 9, -96
-; CHECK-NEXT:    slw 29, 8, 9
-; CHECK-NEXT:    or 12, 25, 12
-; CHECK-NEXT:    srw 25, 7, 28
-; CHECK-NEXT:    bc 12, 0, .LBB0_3
-; CHECK-NEXT:  # %bb.2: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 26, 6, 0
-; CHECK-NEXT:    b .LBB0_3
-; CHECK-NEXT:  .LBB0_3: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 27, 10, 9
-; CHECK-NEXT:    or 29, 29, 25
-; CHECK-NEXT:    srw 25, 11, 28
-; CHECK-NEXT:    stw 26, 12(3)
-; CHECK-NEXT:    subfic 26, 9, 64
-; CHECK-NEXT:    slw 30, 11, 30
-; CHECK-NEXT:    or 27, 27, 25
-; CHECK-NEXT:    addi 25, 9, -32
-; CHECK-NEXT:    or 12, 12, 30
-; CHECK-NEXT:    subfic 30, 26, 32
-; CHECK-NEXT:    srw 28, 10, 28
-; CHECK-NEXT:    slw 30, 10, 30
-; CHECK-NEXT:    srw 10, 10, 26
-; CHECK-NEXT:    srw 26, 11, 26
-; CHECK-NEXT:    slw 24, 11, 0
-; CHECK-NEXT:    slw 0, 7, 25
-; CHECK-NEXT:    or 0, 29, 0
-; CHECK-NEXT:    or 30, 26, 30
-; CHECK-NEXT:    cmplwi 1, 9, 0
-; CHECK-NEXT:    slw 9, 7, 9
-; CHECK-NEXT:    or 10, 0, 10
-; CHECK-NEXT:    or 0, 30, 28
-; CHECK-NEXT:    slw 11, 11, 25
-; CHECK-NEXT:    or 9, 9, 0
-; CHECK-NEXT:    or 11, 27, 11
-; CHECK-NEXT:    bc 12, 0, .LBB0_5
-; CHECK-NEXT:  # %bb.4: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 10, 12, 0
-; CHECK-NEXT:    ori 9, 24, 0
-; CHECK-NEXT:    ori 11, 6, 0
-; CHECK-NEXT:    b .LBB0_5
-; CHECK-NEXT:  .LBB0_5: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 6, .LBB0_7
-; CHECK-NEXT:  # %bb.6: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 8, 10, 0
-; CHECK-NEXT:    ori 7, 9, 0
-; CHECK-NEXT:    b .LBB0_7
-; CHECK-NEXT:  .LBB0_7: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stw 11, 8(3)
-; CHECK-NEXT:    stw 8, 0(3)
-; CHECK-NEXT:    stw 7, 4(3)
+; CHECK-NEXT:    lwz 9, 0(4)
+; CHECK-NEXT:    lwz 10, 4(4)
+; CHECK-NEXT:    lwz 11, 8(4)
+; CHECK-NEXT:    lwz 12, 12(4)
+; CHECK-NEXT:    lwz 0, 12(5)
+; CHECK-NEXT:    stw 6, 44(1)
+; CHECK-NEXT:    stw 6, 40(1)
+; CHECK-NEXT:    stw 6, 36(1)
+; CHECK-NEXT:    stw 6, 32(1)
+; CHECK-NEXT:    stw 12, 28(1)
+; CHECK-NEXT:    clrlwi 12, 0, 29
+; CHECK-NEXT:    stw 11, 24(1)
+; CHECK-NEXT:    nand 11, 0, 7
+; CHECK-NEXT:    stw 10, 20(1)
+; CHECK-NEXT:    subfic 29, 12, 32
+; CHECK-NEXT:    stw 9, 16(1)
+; CHECK-NEXT:    rlwinm 9, 0, 29, 28, 31
+; CHECK-NEXT:    lwzux 10, 9, 8
+; CHECK-NEXT:    clrlwi 11, 11, 27
+; CHECK-NEXT:    lwz 0, 8(9)
+; CHECK-NEXT:    slw 10, 10, 12
+; CHECK-NEXT:    lwz 30, 4(9)
+; CHECK-NEXT:    lwz 9, 12(9)
+; CHECK-NEXT:    slw 28, 30, 12
+; CHECK-NEXT:    srw 30, 30, 29
+; CHECK-NEXT:    srw 29, 9, 29
+; CHECK-NEXT:    slw 9, 9, 12
+; CHECK-NEXT:    slw 12, 0, 12
+; CHECK-NEXT:    srwi 0, 0, 1
+; CHECK-NEXT:    stw 9, 12(3)
+; CHECK-NEXT:    or 9, 12, 29
+; CHECK-NEXT:    srw 11, 0, 11
+; CHECK-NEXT:    stw 9, 8(3)
+; CHECK-NEXT:    or 9, 10, 30
+; CHECK-NEXT:    stw 9, 0(3)
+; CHECK-NEXT:    or 9, 28, 11
+; CHECK-NEXT:    stw 9, 4(3)
 ; CHECK-NEXT:    bdnz .LBB0_1
-; CHECK-NEXT:  # %bb.8: # %for.end
-; CHECK-NEXT:    lwz 30, 40(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 29, 36(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 28, 32(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 27, 28(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 26, 24(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 25, 20(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 24, 16(1) # 4-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 48
+; CHECK-NEXT:  # %bb.2: # %for.end
+; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
+; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
+; CHECK-NEXT:    lwz 28, 48(1) # 4-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 64
 ; CHECK-NEXT:    blr
 entry:
   br label %for.body
@@ -120,114 +83,59 @@ for.end:                                          ; preds = %for.body
 define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stwu 1, -48(1)
-; CHECK-NEXT:    stw 24, 16(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 2048
-; CHECK-NEXT:    stw 25, 20(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 26, 24(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 27, 28(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 28, 32(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 29, 36(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 30, 40(1) # 4-byte Folded Spill
-; CHECK-NEXT:    mtctr 6
+; CHECK-NEXT:    stwu 1, -64(1)
+; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 7, 2048
+; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 6, 7
+; CHECK-NEXT:    mtctr 7
+; CHECK-NEXT:    addi 7, 1, 36
 ; CHECK-NEXT:  .LBB1_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 8, 12(5)
+; CHECK-NEXT:    lwz 8, 0(4)
+; CHECK-NEXT:    lwz 10, 8(4)
+; CHECK-NEXT:    lwz 12, 12(5)
 ; CHECK-NEXT:    lwz 9, 4(4)
-; CHECK-NEXT:    lwz 10, 0(4)
-; CHECK-NEXT:    subfic 11, 8, 96
-; CHECK-NEXT:    lwz 6, 8(4)
-; CHECK-NEXT:    addi 12, 8, -64
-; CHECK-NEXT:    lwz 7, 12(4)
-; CHECK-NEXT:    subfic 29, 8, 32
-; CHECK-NEXT:    slw 11, 10, 11
-; CHECK-NEXT:    srw 25, 9, 12
-; CHECK-NEXT:    srw 30, 7, 8
-; CHECK-NEXT:    or 11, 25, 11
-; CHECK-NEXT:    slw 25, 6, 29
-; CHECK-NEXT:    srw 27, 9, 8
-; CHECK-NEXT:    or 30, 30, 25
-; CHECK-NEXT:    slw 25, 10, 29
-; CHECK-NEXT:    addi 0, 8, -96
-; CHECK-NEXT:    cmplwi 8, 64
-; CHECK-NEXT:    srawi 26, 10, 31
-; CHECK-NEXT:    or 27, 27, 25
-; CHECK-NEXT:    sraw 25, 10, 8
-; CHECK-NEXT:    cmpwi 1, 0, 1
-; CHECK-NEXT:    sraw 24, 10, 0
-; CHECK-NEXT:    bc 12, 0, .LBB1_3
-; CHECK-NEXT:  # %bb.2: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 0, 26, 0
-; CHECK-NEXT:    b .LBB1_4
-; CHECK-NEXT:  .LBB1_3: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 0, 25, 0
-; CHECK-NEXT:  .LBB1_4: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 28, 8, -32
+; CHECK-NEXT:    lwz 11, 12(4)
+; CHECK-NEXT:    stw 10, 44(1)
+; CHECK-NEXT:    rlwinm 10, 12, 29, 28, 31
+; CHECK-NEXT:    stw 8, 36(1)
+; CHECK-NEXT:    srawi 8, 8, 31
+; CHECK-NEXT:    stw 11, 48(1)
+; CHECK-NEXT:    clrlwi 11, 12, 29
+; CHECK-NEXT:    stw 9, 40(1)
+; CHECK-NEXT:    nand 9, 12, 6
+; CHECK-NEXT:    stw 8, 32(1)
+; CHECK-NEXT:    subfic 30, 11, 32
+; CHECK-NEXT:    stw 8, 28(1)
+; CHECK-NEXT:    clrlwi 9, 9, 27
+; CHECK-NEXT:    stw 8, 24(1)
+; CHECK-NEXT:    stw 8, 20(1)
+; CHECK-NEXT:    sub 8, 7, 10
+; CHECK-NEXT:    lwz 10, 4(8)
+; CHECK-NEXT:    lwz 12, 8(8)
+; CHECK-NEXT:    lwz 0, 0(8)
+; CHECK-NEXT:    lwz 8, 12(8)
+; CHECK-NEXT:    srw 29, 12, 11
+; CHECK-NEXT:    slw 12, 12, 30
+; CHECK-NEXT:    slw 30, 0, 30
+; CHECK-NEXT:    srw 8, 8, 11
+; CHECK-NEXT:    sraw 0, 0, 11
+; CHECK-NEXT:    srw 11, 10, 11
+; CHECK-NEXT:    slwi 10, 10, 1
+; CHECK-NEXT:    or 8, 12, 8
+; CHECK-NEXT:    slw 9, 10, 9
+; CHECK-NEXT:    stw 8, 12(3)
+; CHECK-NEXT:    or 8, 30, 11
+; CHECK-NEXT:    stw 8, 4(3)
+; CHECK-NEXT:    or 8, 29, 9
 ; CHECK-NEXT:    stw 0, 0(3)
-; CHECK-NEXT:    subfic 0, 8, 64
-; CHECK-NEXT:    subfic 25, 0, 32
-; CHECK-NEXT:    slw 29, 9, 29
-; CHECK-NEXT:    srw 25, 9, 25
-; CHECK-NEXT:    slw 9, 9, 0
-; CHECK-NEXT:    slw 0, 10, 0
-; CHECK-NEXT:    bc 12, 4, .LBB1_6
-; CHECK-NEXT:  # %bb.5: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 11, 24, 0
-; CHECK-NEXT:    b .LBB1_6
-; CHECK-NEXT:  .LBB1_6: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sraw 12, 10, 12
-; CHECK-NEXT:    sraw 10, 10, 28
-; CHECK-NEXT:    cmpwi 1, 28, 1
-; CHECK-NEXT:    srw 28, 6, 28
-; CHECK-NEXT:    or 0, 0, 25
-; CHECK-NEXT:    or 30, 30, 28
-; CHECK-NEXT:    bc 12, 4, .LBB1_7
-; CHECK-NEXT:    b .LBB1_8
-; CHECK-NEXT:  .LBB1_7: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 10, 27, 0
-; CHECK-NEXT:  .LBB1_8: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplwi 1, 8, 0
-; CHECK-NEXT:    srw 8, 6, 8
-; CHECK-NEXT:    or 0, 0, 29
-; CHECK-NEXT:    or 9, 30, 9
-; CHECK-NEXT:    or 8, 8, 0
-; CHECK-NEXT:    bc 12, 0, .LBB1_10
-; CHECK-NEXT:  # %bb.9: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 9, 11, 0
-; CHECK-NEXT:    ori 8, 12, 0
-; CHECK-NEXT:    ori 10, 26, 0
-; CHECK-NEXT:    b .LBB1_10
-; CHECK-NEXT:  .LBB1_10: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 6, .LBB1_12
-; CHECK-NEXT:  # %bb.11: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 7, 9, 0
-; CHECK-NEXT:    ori 6, 8, 0
-; CHECK-NEXT:    b .LBB1_12
-; CHECK-NEXT:  .LBB1_12: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stw 10, 4(3)
-; CHECK-NEXT:    stw 7, 12(3)
-; CHECK-NEXT:    stw 6, 8(3)
+; CHECK-NEXT:    stw 8, 8(3)
 ; CHECK-NEXT:    bdnz .LBB1_1
-; CHECK-NEXT:  # %bb.13: # %for.end
-; CHECK-NEXT:    lwz 30, 40(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 29, 36(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 28, 32(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 27, 28(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 26, 24(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 25, 20(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 24, 16(1) # 4-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 48
+; CHECK-NEXT:  # %bb.2: # %for.end
+; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
+; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 64
 ; CHECK-NEXT:    blr
 entry:
   br label %for.body
@@ -250,97 +158,61 @@ for.end:                                          ; preds = %for.body
 define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stwu 1, -48(1)
-; CHECK-NEXT:    stw 24, 16(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 2048
-; CHECK-NEXT:    stw 25, 20(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 26, 24(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 27, 28(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 28, 32(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 29, 36(1) # 4-byte Folded Spill
-; CHECK-NEXT:    stw 30, 40(1) # 4-byte Folded Spill
-; CHECK-NEXT:    mtctr 6
+; CHECK-NEXT:    stwu 1, -64(1)
+; CHECK-NEXT:    stw 28, 48(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 8, 2048
+; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
 ; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 7, 7
+; CHECK-NEXT:    mtctr 8
+; CHECK-NEXT:    addi 8, 1, 32
 ; CHECK-NEXT:  .LBB2_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 9, 12(5)
 ; CHECK-NEXT:    lwz 10, 4(4)
-; CHECK-NEXT:    lwz 11, 0(4)
-; CHECK-NEXT:    subfic 12, 9, 96
-; CHECK-NEXT:    lwz 7, 8(4)
-; CHECK-NEXT:    addi 0, 9, -64
-; CHECK-NEXT:    lwz 8, 12(4)
-; CHECK-NEXT:    subfic 28, 9, 32
-; CHECK-NEXT:    cmplwi 9, 64
-; CHECK-NEXT:    srw 26, 11, 9
-; CHECK-NEXT:    slw 12, 11, 12
-; CHECK-NEXT:    srw 25, 10, 0
-; CHECK-NEXT:    addi 30, 9, -96
-; CHECK-NEXT:    srw 29, 8, 9
-; CHECK-NEXT:    or 12, 25, 12
-; CHECK-NEXT:    slw 25, 7, 28
-; CHECK-NEXT:    bc 12, 0, .LBB2_3
-; CHECK-NEXT:  # %bb.2: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 26, 6, 0
-; CHECK-NEXT:    b .LBB2_3
-; CHECK-NEXT:  .LBB2_3: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 27, 10, 9
-; CHECK-NEXT:    or 29, 29, 25
-; CHECK-NEXT:    slw 25, 11, 28
-; CHECK-NEXT:    stw 26, 0(3)
-; CHECK-NEXT:    subfic 26, 9, 64
-; CHECK-NEXT:    srw 30, 11, 30
-; CHECK-NEXT:    or 27, 27, 25
-; CHECK-NEXT:    addi 25, 9, -32
-; CHECK-NEXT:    or 12, 12, 30
-; CHECK-NEXT:    subfic 30, 26, 32
-; CHECK-NEXT:    slw 28, 10, 28
-; CHECK-NEXT:    srw 30, 10, 30
-; CHECK-NEXT:    slw 10, 10, 26
-; CHECK-NEXT:    slw 26, 11, 26
-; CHECK-NEXT:    srw 24, 11, 0
-; CHECK-NEXT:    srw 0, 7, 25
-; CHECK-NEXT:    or 0, 29, 0
-; CHECK-NEXT:    or 30, 26, 30
-; CHECK-NEXT:    cmplwi 1, 9, 0
-; CHECK-NEXT:    srw 9, 7, 9
+; CHECK-NEXT:    lwz 0, 12(5)
+; CHECK-NEXT:    lwz 9, 0(4)
+; CHECK-NEXT:    lwz 11, 8(4)
+; CHECK-NEXT:    lwz 12, 12(4)
+; CHECK-NEXT:    stw 10, 36(1)
+; CHECK-NEXT:    rlwinm 10, 0, 29, 28, 31
+; CHECK-NEXT:    stw 6, 28(1)
+; CHECK-NEXT:    sub 10, 8, 10
+; CHECK-NEXT:    stw 6, 24(1)
+; CHECK-NEXT:    stw 6, 20(1)
+; CHECK-NEXT:    stw 6, 16(1)
+; CHECK-NEXT:    stw 12, 44(1)
+; CHECK-NEXT:    clrlwi 12, 0, 29
+; CHECK-NEXT:    stw 11, 40(1)
+; CHECK-NEXT:    subfic 29, 12, 32
+; CHECK-NEXT:    stw 9, 32(1)
+; CHECK-NEXT:    nand 9, 0, 7
+; CHECK-NEXT:    lwz 11, 4(10)
+; CHECK-NEXT:    clrlwi 9, 9, 27
+; CHECK-NEXT:    lwz 0, 8(10)
+; CHECK-NEXT:    lwz 30, 0(10)
+; CHECK-NEXT:    lwz 10, 12(10)
+; CHECK-NEXT:    srw 28, 0, 12
+; CHECK-NEXT:    slw 0, 0, 29
+; CHECK-NEXT:    slw 29, 30, 29
+; CHECK-NEXT:    srw 10, 10, 12
+; CHECK-NEXT:    srw 30, 30, 12
+; CHECK-NEXT:    srw 12, 11, 12
+; CHECK-NEXT:    slwi 11, 11, 1
+; CHECK-NEXT:    slw 9, 11, 9
 ; CHECK-NEXT:    or 10, 0, 10
-; CHECK-NEXT:    or 0, 30, 28
-; CHECK-NEXT:    srw 11, 11, 25
-; CHECK-NEXT:    or 9, 9, 0
-; CHECK-NEXT:    or 11, 27, 11
-; CHECK-NEXT:    bc 12, 0, .LBB2_5
-; CHECK-NEXT:  # %bb.4: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 10, 12, 0
-; CHECK-NEXT:    ori 9, 24, 0
-; CHECK-NEXT:    ori 11, 6, 0
-; CHECK-NEXT:    b .LBB2_5
-; CHECK-NEXT:  .LBB2_5: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 6, .LBB2_7
-; CHECK-NEXT:  # %bb.6: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ori 8, 10, 0
-; CHECK-NEXT:    ori 7, 9, 0
-; CHECK-NEXT:    b .LBB2_7
-; CHECK-NEXT:  .LBB2_7: # %for.body
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stw 11, 4(3)
-; CHECK-NEXT:    stw 8, 12(3)
-; CHECK-NEXT:    stw 7, 8(3)
+; CHECK-NEXT:    stw 10, 12(3)
+; CHECK-NEXT:    or 10, 29, 12
+; CHECK-NEXT:    or 9, 28, 9
+; CHECK-NEXT:    stw 30, 0(3)
+; CHECK-NEXT:    stw 10, 4(3)
+; CHECK-NEXT:    stw 9, 8(3)
 ; CHECK-NEXT:    bdnz .LBB2_1
-; CHECK-NEXT:  # %bb.8: # %for.end
-; CHECK-NEXT:    lwz 30, 40(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 29, 36(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 28, 32(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 27, 28(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 26, 24(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 25, 20(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 24, 16(1) # 4-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 48
+; CHECK-NEXT:  # %bb.2: # %for.end
+; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
+; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
+; CHECK-NEXT:    lwz 28, 48(1) # 4-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 64
 ; CHECK-NEXT:    blr
 entry:
   br label %for.body

diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
index 1579bcdb6e852..6bbd94b28faaf 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -226,93 +226,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; LE-32BIT-LABEL: lshr_16bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -32(1)
+; LE-32BIT-NEXT:    stwu 1, -48(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    li 8, 0
-; LE-32BIT-NEXT:    lwz 6, 8(3)
-; LE-32BIT-NEXT:    lwz 7, 12(3)
-; LE-32BIT-NEXT:    rlwinm. 4, 4, 3, 0, 28
-; LE-32BIT-NEXT:    lwz 9, 4(3)
-; LE-32BIT-NEXT:    subfic 10, 4, 96
-; LE-32BIT-NEXT:    lwz 3, 0(3)
-; LE-32BIT-NEXT:    addi 11, 4, -64
-; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    cmplwi 1, 4, 64
-; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 28, 3, 4
-; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    slw 10, 3, 10
-; LE-32BIT-NEXT:    srw 27, 9, 11
-; LE-32BIT-NEXT:    addi 12, 4, -96
-; LE-32BIT-NEXT:    srw 0, 7, 4
-; LE-32BIT-NEXT:    or 10, 27, 10
-; LE-32BIT-NEXT:    slw 27, 6, 30
-; LE-32BIT-NEXT:    bc 12, 4, .LBB6_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 28, 8, 0
-; LE-32BIT-NEXT:    b .LBB6_2
-; LE-32BIT-NEXT:  .LBB6_2:
-; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 29, 9, 4
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    slw 27, 3, 30
-; LE-32BIT-NEXT:    stw 28, 0(5)
-; LE-32BIT-NEXT:    subfic 28, 4, 64
-; LE-32BIT-NEXT:    srw 12, 3, 12
-; LE-32BIT-NEXT:    or 29, 29, 27
-; LE-32BIT-NEXT:    addi 27, 4, -32
-; LE-32BIT-NEXT:    or 10, 10, 12
-; LE-32BIT-NEXT:    subfic 12, 28, 32
-; LE-32BIT-NEXT:    slw 30, 9, 30
-; LE-32BIT-NEXT:    srw 12, 9, 12
-; LE-32BIT-NEXT:    slw 9, 9, 28
-; LE-32BIT-NEXT:    slw 28, 3, 28
-; LE-32BIT-NEXT:    srw 11, 3, 11
-; LE-32BIT-NEXT:    srw 3, 3, 27
-; LE-32BIT-NEXT:    srw 27, 6, 27
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    or 12, 28, 12
-; LE-32BIT-NEXT:    srw 4, 6, 4
-; LE-32BIT-NEXT:    or 3, 29, 3
-; LE-32BIT-NEXT:    or 9, 0, 9
-; LE-32BIT-NEXT:    or 12, 12, 30
-; LE-32BIT-NEXT:    bc 12, 4, .LBB6_4
-; LE-32BIT-NEXT:  # %bb.3:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    ori 8, 10, 0
-; LE-32BIT-NEXT:    b .LBB6_5
-; LE-32BIT-NEXT:  .LBB6_4:
-; LE-32BIT-NEXT:    addi 8, 9, 0
-; LE-32BIT-NEXT:  .LBB6_5:
-; LE-32BIT-NEXT:    or 4, 4, 12
-; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    bc 12, 2, .LBB6_7
-; LE-32BIT-NEXT:  # %bb.6:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    b .LBB6_8
-; LE-32BIT-NEXT:  .LBB6_7:
-; LE-32BIT-NEXT:    addi 3, 7, 0
-; LE-32BIT-NEXT:  .LBB6_8:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB6_10
-; LE-32BIT-NEXT:  # %bb.9:
-; LE-32BIT-NEXT:    ori 4, 11, 0
-; LE-32BIT-NEXT:    b .LBB6_10
-; LE-32BIT-NEXT:  .LBB6_10:
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    addi 3, 1, 32
+; LE-32BIT-NEXT:    clrlwi 4, 4, 28
+; LE-32BIT-NEXT:    stw 6, 28(1)
+; LE-32BIT-NEXT:    sub 3, 3, 4
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    lwz 4, 4(3)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    stw 7, 8(5)
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    bc 12, 2, .LBB6_12
-; LE-32BIT-NEXT:  # %bb.11:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB6_13
-; LE-32BIT-NEXT:  .LBB6_12:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB6_13:
-; LE-32BIT-NEXT:    stw 3, 8(5)
-; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    stw 6, 0(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -360,93 +300,32 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; LE-32BIT-LABEL: shl_16bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -32(1)
-; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    li 8, 0
-; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
-; LE-32BIT-NEXT:    rlwinm. 4, 4, 3, 0, 28
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    subfic 10, 4, 96
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
-; LE-32BIT-NEXT:    addi 11, 4, -64
-; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    cmplwi 1, 4, 64
-; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 28, 3, 4
-; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    srw 10, 3, 10
-; LE-32BIT-NEXT:    slw 27, 9, 11
-; LE-32BIT-NEXT:    addi 12, 4, -96
-; LE-32BIT-NEXT:    slw 0, 7, 4
-; LE-32BIT-NEXT:    or 10, 27, 10
-; LE-32BIT-NEXT:    srw 27, 6, 30
-; LE-32BIT-NEXT:    bc 12, 4, .LBB7_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 28, 8, 0
-; LE-32BIT-NEXT:    b .LBB7_2
-; LE-32BIT-NEXT:  .LBB7_2:
-; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 29, 9, 4
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    srw 27, 3, 30
-; LE-32BIT-NEXT:    stw 28, 12(5)
-; LE-32BIT-NEXT:    subfic 28, 4, 64
-; LE-32BIT-NEXT:    slw 12, 3, 12
-; LE-32BIT-NEXT:    or 29, 29, 27
-; LE-32BIT-NEXT:    addi 27, 4, -32
-; LE-32BIT-NEXT:    or 10, 10, 12
-; LE-32BIT-NEXT:    subfic 12, 28, 32
-; LE-32BIT-NEXT:    srw 30, 9, 30
-; LE-32BIT-NEXT:    slw 12, 9, 12
-; LE-32BIT-NEXT:    srw 9, 9, 28
-; LE-32BIT-NEXT:    srw 28, 3, 28
-; LE-32BIT-NEXT:    slw 11, 3, 11
-; LE-32BIT-NEXT:    slw 3, 3, 27
-; LE-32BIT-NEXT:    slw 27, 6, 27
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    or 12, 28, 12
-; LE-32BIT-NEXT:    slw 4, 6, 4
-; LE-32BIT-NEXT:    or 3, 29, 3
-; LE-32BIT-NEXT:    or 9, 0, 9
-; LE-32BIT-NEXT:    or 12, 12, 30
-; LE-32BIT-NEXT:    bc 12, 4, .LBB7_4
-; LE-32BIT-NEXT:  # %bb.3:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    ori 8, 10, 0
-; LE-32BIT-NEXT:    b .LBB7_5
-; LE-32BIT-NEXT:  .LBB7_4:
-; LE-32BIT-NEXT:    addi 8, 9, 0
-; LE-32BIT-NEXT:  .LBB7_5:
-; LE-32BIT-NEXT:    or 4, 4, 12
-; LE-32BIT-NEXT:    stw 3, 8(5)
-; LE-32BIT-NEXT:    bc 12, 2, .LBB7_7
-; LE-32BIT-NEXT:  # %bb.6:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    b .LBB7_8
-; LE-32BIT-NEXT:  .LBB7_7:
-; LE-32BIT-NEXT:    addi 3, 7, 0
-; LE-32BIT-NEXT:  .LBB7_8:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB7_10
-; LE-32BIT-NEXT:  # %bb.9:
-; LE-32BIT-NEXT:    ori 4, 11, 0
-; LE-32BIT-NEXT:    b .LBB7_10
-; LE-32BIT-NEXT:  .LBB7_10:
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 28
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    stw 7, 16(1)
+; LE-32BIT-NEXT:    lwzux 3, 4, 3
+; LE-32BIT-NEXT:    lwz 6, 4(4)
+; LE-32BIT-NEXT:    lwz 7, 12(4)
+; LE-32BIT-NEXT:    lwz 4, 8(4)
 ; LE-32BIT-NEXT:    stw 3, 0(5)
-; LE-32BIT-NEXT:    bc 12, 2, .LBB7_12
-; LE-32BIT-NEXT:  # %bb.11:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB7_13
-; LE-32BIT-NEXT:  .LBB7_12:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB7_13:
-; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    stw 4, 8(5)
+; LE-32BIT-NEXT:    stw 7, 12(5)
+; LE-32BIT-NEXT:    stw 6, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -500,95 +379,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; LE-32BIT-LABEL: ashr_16bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -32(1)
+; LE-32BIT-NEXT:    stwu 1, -48(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    addi 6, 1, 32
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    lwz 8, 0(3)
-; LE-32BIT-NEXT:    lwz 9, 4(3)
-; LE-32BIT-NEXT:    lwz 6, 8(3)
-; LE-32BIT-NEXT:    lwz 7, 12(3)
-; LE-32BIT-NEXT:    rlwinm. 3, 4, 3, 0, 28
-; LE-32BIT-NEXT:    subfic 10, 3, 96
-; LE-32BIT-NEXT:    addi 11, 3, -64
-; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    addi 12, 3, -96
-; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 30, 3, 32
-; LE-32BIT-NEXT:    slw 10, 8, 10
-; LE-32BIT-NEXT:    srw 27, 9, 11
-; LE-32BIT-NEXT:    stw 26, 8(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    sraw 26, 8, 12
-; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 28, 9, 3
-; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    addi 29, 3, -32
-; LE-32BIT-NEXT:    cmpwi 1, 12, 1
-; LE-32BIT-NEXT:    slw 12, 8, 30
-; LE-32BIT-NEXT:    or 10, 27, 10
-; LE-32BIT-NEXT:    srw 0, 7, 3
-; LE-32BIT-NEXT:    sraw 27, 8, 29
-; LE-32BIT-NEXT:    bc 12, 4, .LBB8_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 10, 26, 0
-; LE-32BIT-NEXT:    b .LBB8_2
-; LE-32BIT-NEXT:  .LBB8_2:
-; LE-32BIT-NEXT:    cmpwi 1, 29, 1
-; LE-32BIT-NEXT:    or 12, 28, 12
-; LE-32BIT-NEXT:    subfic 28, 3, 64
-; LE-32BIT-NEXT:    slw 26, 6, 30
-; LE-32BIT-NEXT:    srawi 4, 8, 31
-; LE-32BIT-NEXT:    bc 12, 4, .LBB8_4
-; LE-32BIT-NEXT:  # %bb.3:
-; LE-32BIT-NEXT:    ori 12, 27, 0
-; LE-32BIT-NEXT:    b .LBB8_4
-; LE-32BIT-NEXT:  .LBB8_4:
-; LE-32BIT-NEXT:    sraw 27, 8, 3
-; LE-32BIT-NEXT:    or 0, 0, 26
-; LE-32BIT-NEXT:    slw 26, 9, 28
-; LE-32BIT-NEXT:    sraw 11, 8, 11
-; LE-32BIT-NEXT:    slw 8, 8, 28
-; LE-32BIT-NEXT:    subfic 28, 28, 32
-; LE-32BIT-NEXT:    slw 30, 9, 30
-; LE-32BIT-NEXT:    srw 9, 9, 28
-; LE-32BIT-NEXT:    srw 29, 6, 29
-; LE-32BIT-NEXT:    or 8, 8, 9
-; LE-32BIT-NEXT:    cmplwi 1, 3, 64
-; LE-32BIT-NEXT:    or 0, 0, 29
-; LE-32BIT-NEXT:    srw 3, 6, 3
-; LE-32BIT-NEXT:    or 8, 8, 30
-; LE-32BIT-NEXT:    or 9, 0, 26
-; LE-32BIT-NEXT:    or 3, 3, 8
-; LE-32BIT-NEXT:    bc 12, 4, .LBB8_6
-; LE-32BIT-NEXT:  # %bb.5:
-; LE-32BIT-NEXT:    ori 28, 4, 0
-; LE-32BIT-NEXT:    ori 9, 10, 0
-; LE-32BIT-NEXT:    ori 3, 11, 0
-; LE-32BIT-NEXT:    b .LBB8_7
-; LE-32BIT-NEXT:  .LBB8_6:
-; LE-32BIT-NEXT:    addi 28, 27, 0
-; LE-32BIT-NEXT:    addi 4, 12, 0
-; LE-32BIT-NEXT:  .LBB8_7:
-; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 2, .LBB8_8
-; LE-32BIT-NEXT:    b .LBB8_9
-; LE-32BIT-NEXT:  .LBB8_8:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB8_9:
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    srawi 3, 7, 31
+; LE-32BIT-NEXT:    clrlwi 4, 4, 28
+; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 16(1)
+; LE-32BIT-NEXT:    sub 3, 6, 4
+; LE-32BIT-NEXT:    lwz 4, 4(3)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    stw 7, 8(5)
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    stw 6, 0(5)
 ; LE-32BIT-NEXT:    stw 4, 4(5)
-; LE-32BIT-NEXT:    bc 12, 2, .LBB8_11
-; LE-32BIT-NEXT:  # %bb.10:
-; LE-32BIT-NEXT:    ori 4, 9, 0
-; LE-32BIT-NEXT:    b .LBB8_12
-; LE-32BIT-NEXT:  .LBB8_11:
-; LE-32BIT-NEXT:    addi 4, 7, 0
-; LE-32BIT-NEXT:  .LBB8_12:
-; LE-32BIT-NEXT:    stw 28, 0(5)
-; LE-32BIT-NEXT:    stw 4, 12(5)
-; LE-32BIT-NEXT:    stw 3, 8(5)
-; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 26, 8(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -601,583 +418,106 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: lshr_32bytes:
 ; LE-64BIT:       # %bb.0:
-; LE-64BIT-NEXT:    lwz 4, 0(4)
-; LE-64BIT-NEXT:    ld 7, 0(3)
-; LE-64BIT-NEXT:    ld 8, 8(3)
-; LE-64BIT-NEXT:    ld 9, 16(3)
-; LE-64BIT-NEXT:    li 6, 0
-; LE-64BIT-NEXT:    ld 3, 24(3)
-; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 21, -88(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 24, -64(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    rlwinm. 4, 4, 3, 0, 28
-; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    subfic 28, 4, 64
-; LE-64BIT-NEXT:    subfic 11, 4, 192
-; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    addi 0, 4, -128
-; LE-64BIT-NEXT:    srd 29, 9, 4
-; LE-64BIT-NEXT:    addi 27, 4, -64
-; LE-64BIT-NEXT:    subfic 25, 4, 128
-; LE-64BIT-NEXT:    sld 24, 8, 28
-; LE-64BIT-NEXT:    std 22, -80(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    sld 21, 9, 28
-; LE-64BIT-NEXT:    sld 28, 3, 28
-; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    srd 10, 7, 4
-; LE-64BIT-NEXT:    addi 30, 4, -192
-; LE-64BIT-NEXT:    subfic 22, 25, 64
-; LE-64BIT-NEXT:    sld 11, 3, 11
-; LE-64BIT-NEXT:    srd 26, 9, 0
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    std 23, -72(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    or 10, 10, 24
-; LE-64BIT-NEXT:    srd 28, 3, 27
-; LE-64BIT-NEXT:    srd 30, 3, 30
-; LE-64BIT-NEXT:    or 11, 26, 11
-; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 23, 8, 27
-; LE-64BIT-NEXT:    srd 27, 9, 22
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    or 11, 11, 30
-; LE-64BIT-NEXT:    ld 24, -64(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 28, 3, 25
-; LE-64BIT-NEXT:    or 10, 10, 23
-; LE-64BIT-NEXT:    ld 23, -72(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 22, -80(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 9, 9, 25
-; LE-64BIT-NEXT:    or 30, 28, 27
-; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    cmplwi 1, 4, 128
-; LE-64BIT-NEXT:    srd 12, 8, 4
-; LE-64BIT-NEXT:    or 9, 10, 9
-; LE-64BIT-NEXT:    or 30, 30, 21
-; LE-64BIT-NEXT:    ld 21, -88(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 10, 3, 0
-; LE-64BIT-NEXT:    isel 9, 9, 11, 4
-; LE-64BIT-NEXT:    or 11, 12, 30
-; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    iseleq 7, 7, 9
-; LE-64BIT-NEXT:    srd 3, 3, 4
-; LE-64BIT-NEXT:    isel 9, 11, 10, 4
-; LE-64BIT-NEXT:    std 7, 0(5)
-; LE-64BIT-NEXT:    isel 0, 29, 6, 4
-; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    iseleq 4, 8, 9
-; LE-64BIT-NEXT:    std 0, 16(5)
-; LE-64BIT-NEXT:    isel 3, 3, 6, 4
-; LE-64BIT-NEXT:    std 4, 8(5)
-; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    li 6, 16
+; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    xxlxor 2, 2, 2
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    li 8, 32
+; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
+; LE-64BIT-NEXT:    lwz 3, 0(4)
+; LE-64BIT-NEXT:    li 4, 48
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 8
+; LE-64BIT-NEXT:    clrldi 3, 3, 59
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
+; LE-64BIT-NEXT:    stxvd2x 1, 0, 7
+; LE-64BIT-NEXT:    add 4, 7, 3
+; LE-64BIT-NEXT:    lxvd2x 0, 7, 3
+; LE-64BIT-NEXT:    lxvd2x 1, 4, 6
+; LE-64BIT-NEXT:    stxvd2x 1, 5, 6
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: lshr_32bytes:
 ; BE:       # %bb.0:
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 7, 8(3)
+; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    ld 7, 16(3)
-; BE-NEXT:    ld 8, 24(3)
-; BE-NEXT:    ld 9, 8(3)
-; BE-NEXT:    ld 3, 0(3)
-; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; BE-NEXT:    li 6, 0
-; BE-NEXT:    rlwinm. 4, 4, 3, 0, 28
-; BE-NEXT:    subfic 10, 4, 192
-; BE-NEXT:    addi 11, 4, -128
-; BE-NEXT:    addi 12, 4, -192
-; BE-NEXT:    subfic 30, 4, 64
-; BE-NEXT:    sld 10, 3, 10
-; BE-NEXT:    srd 27, 9, 11
-; BE-NEXT:    srd 0, 8, 4
-; BE-NEXT:    addi 29, 4, -64
-; BE-NEXT:    subfic 28, 4, 128
-; BE-NEXT:    srd 12, 3, 12
-; BE-NEXT:    or 10, 27, 10
-; BE-NEXT:    sld 27, 7, 30
-; BE-NEXT:    or 10, 10, 12
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    srd 27, 7, 29
-; BE-NEXT:    subfic 12, 28, 64
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    sld 27, 3, 28
-; BE-NEXT:    srd 12, 9, 12
-; BE-NEXT:    sld 28, 9, 28
-; BE-NEXT:    cmplwi 1, 4, 128
-; BE-NEXT:    or 12, 27, 12
-; BE-NEXT:    or 28, 0, 28
-; BE-NEXT:    sld 0, 9, 30
-; BE-NEXT:    srd 9, 9, 4
-; BE-NEXT:    srd 11, 3, 11
-; BE-NEXT:    bc 12, 4, .LBB9_1
-; BE-NEXT:    b .LBB9_2
-; BE-NEXT:  .LBB9_1:
-; BE-NEXT:    addi 10, 28, 0
-; BE-NEXT:  .LBB9_2:
-; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; BE-NEXT:    or 12, 12, 0
-; BE-NEXT:    srd 0, 7, 4
-; BE-NEXT:    or 12, 0, 12
-; BE-NEXT:    sld 0, 3, 30
-; BE-NEXT:    srd 30, 3, 29
-; BE-NEXT:    bc 12, 4, .LBB9_3
-; BE-NEXT:    b .LBB9_4
-; BE-NEXT:  .LBB9_3:
-; BE-NEXT:    addi 11, 12, 0
-; BE-NEXT:  .LBB9_4:
-; BE-NEXT:    srd 3, 3, 4
-; BE-NEXT:    bc 12, 2, .LBB9_6
-; BE-NEXT:  # %bb.5:
-; BE-NEXT:    ori 4, 10, 0
-; BE-NEXT:    b .LBB9_7
-; BE-NEXT:  .LBB9_6:
-; BE-NEXT:    addi 4, 8, 0
-; BE-NEXT:  .LBB9_7:
-; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; BE-NEXT:    or 9, 9, 0
-; BE-NEXT:    or 9, 9, 30
-; BE-NEXT:    bc 12, 2, .LBB9_9
-; BE-NEXT:  # %bb.8:
-; BE-NEXT:    ori 7, 11, 0
-; BE-NEXT:    b .LBB9_9
-; BE-NEXT:  .LBB9_9:
-; BE-NEXT:    bc 12, 4, .LBB9_11
-; BE-NEXT:  # %bb.10:
-; BE-NEXT:    ori 8, 6, 0
-; BE-NEXT:    ori 3, 6, 0
-; BE-NEXT:    b .LBB9_12
-; BE-NEXT:  .LBB9_11:
-; BE-NEXT:    addi 8, 9, 0
-; BE-NEXT:  .LBB9_12:
-; BE-NEXT:    std 4, 24(5)
-; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; BE-NEXT:    addi 9, 1, -64
+; BE-NEXT:    li 10, 0
+; BE-NEXT:    std 10, 24(9)
+; BE-NEXT:    std 10, 16(9)
+; BE-NEXT:    std 10, 8(9)
+; BE-NEXT:    std 10, -64(1)
+; BE-NEXT:    std 3, 56(9)
+; BE-NEXT:    clrlwi 3, 4, 27
+; BE-NEXT:    neg 3, 3
+; BE-NEXT:    std 8, 48(9)
+; BE-NEXT:    std 7, 40(9)
+; BE-NEXT:    std 6, 32(9)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    addi 4, 1, -32
+; BE-NEXT:    ldux 3, 4, 3
+; BE-NEXT:    ld 6, 8(4)
+; BE-NEXT:    ld 7, 24(4)
+; BE-NEXT:    ld 4, 16(4)
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 8, 8(5)
-; BE-NEXT:    std 7, 16(5)
+; BE-NEXT:    std 4, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 6, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: lshr_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -144(1)
-; LE-32BIT-NEXT:    mfcr 12
-; LE-32BIT-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    addi 3, 1, 48
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    sub 3, 3, 4
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 6, 28(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    stw 0, 72(1)
 ; LE-32BIT-NEXT:    stw 12, 68(1)
-; LE-32BIT-NEXT:    lwz 0, 28(4)
-; LE-32BIT-NEXT:    lwz 11, 4(3)
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    lwz 4, 4(3)
 ; LE-32BIT-NEXT:    lwz 6, 0(3)
-; LE-32BIT-NEXT:    rlwinm. 30, 0, 3, 0, 28
-; LE-32BIT-NEXT:    stw 5, 64(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 21, 30, 224
-; LE-32BIT-NEXT:    lwz 5, 24(3)
-; LE-32BIT-NEXT:    subfic 4, 30, 160
-; LE-32BIT-NEXT:    lwz 7, 28(3)
-; LE-32BIT-NEXT:    addi 0, 30, -128
-; LE-32BIT-NEXT:    lwz 10, 20(3)
-; LE-32BIT-NEXT:    subfic 28, 30, 96
-; LE-32BIT-NEXT:    lwz 8, 16(3)
-; LE-32BIT-NEXT:    addi 29, 30, -64
-; LE-32BIT-NEXT:    lwz 27, 12(3)
-; LE-32BIT-NEXT:    subfic 12, 30, 32
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    addi 3, 30, -192
-; LE-32BIT-NEXT:    slw 21, 6, 21
-; LE-32BIT-NEXT:    srw 16, 11, 3
-; LE-32BIT-NEXT:    stw 3, 56(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 20, 7, 30
-; LE-32BIT-NEXT:    slw 15, 9, 4
-; LE-32BIT-NEXT:    srw 14, 27, 0
-; LE-32BIT-NEXT:    slw 31, 8, 28
-; LE-32BIT-NEXT:    srw 3, 10, 29
-; LE-32BIT-NEXT:    or 21, 16, 21
-; LE-32BIT-NEXT:    slw 16, 5, 12
-; LE-32BIT-NEXT:    srw 19, 10, 30
-; LE-32BIT-NEXT:    or 15, 14, 15
-; LE-32BIT-NEXT:    slw 14, 8, 12
-; LE-32BIT-NEXT:    or 3, 3, 31
-; LE-32BIT-NEXT:    slw 31, 6, 4
-; LE-32BIT-NEXT:    or 20, 20, 16
-; LE-32BIT-NEXT:    srw 16, 11, 0
-; LE-32BIT-NEXT:    stw 7, 60(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    addi 26, 30, -224
-; LE-32BIT-NEXT:    mr 7, 10
-; LE-32BIT-NEXT:    mr 10, 12
-; LE-32BIT-NEXT:    or 19, 19, 14
-; LE-32BIT-NEXT:    slw 14, 6, 28
-; LE-32BIT-NEXT:    or 16, 16, 31
-; LE-32BIT-NEXT:    srw 31, 11, 29
-; LE-32BIT-NEXT:    addi 23, 30, -160
-; LE-32BIT-NEXT:    srw 18, 27, 30
-; LE-32BIT-NEXT:    stw 0, 40(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 12, 28
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    mr 28, 9
-; LE-32BIT-NEXT:    slw 31, 9, 10
-; LE-32BIT-NEXT:    srw 0, 6, 26
-; LE-32BIT-NEXT:    addi 25, 30, -96
-; LE-32BIT-NEXT:    srw 17, 11, 30
-; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    slw 31, 6, 10
-; LE-32BIT-NEXT:    or 4, 21, 0
-; LE-32BIT-NEXT:    srw 0, 28, 23
-; LE-32BIT-NEXT:    or 17, 17, 31
-; LE-32BIT-NEXT:    addi 31, 30, -32
-; LE-32BIT-NEXT:    or 0, 15, 0
-; LE-32BIT-NEXT:    srw 15, 8, 25
-; LE-32BIT-NEXT:    or 3, 3, 15
-; LE-32BIT-NEXT:    srw 15, 5, 31
-; LE-32BIT-NEXT:    or 20, 20, 15
-; LE-32BIT-NEXT:    srw 15, 8, 31
-; LE-32BIT-NEXT:    stw 3, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 3, 19, 15
-; LE-32BIT-NEXT:    srw 23, 6, 23
-; LE-32BIT-NEXT:    stw 3, 48(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 15, 30, 64
-; LE-32BIT-NEXT:    or 3, 16, 23
-; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 3, 15, 32
-; LE-32BIT-NEXT:    slw 16, 28, 15
-; LE-32BIT-NEXT:    srw 22, 27, 3
-; LE-32BIT-NEXT:    stw 4, 32(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 4, 16, 22
-; LE-32BIT-NEXT:    subfic 16, 30, 128
-; LE-32BIT-NEXT:    stw 5, 28(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 5, 16, 32
-; LE-32BIT-NEXT:    stw 4, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 4, 6, 16
-; LE-32BIT-NEXT:    srw 24, 11, 5
-; LE-32BIT-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 29, 27
-; LE-32BIT-NEXT:    or 22, 4, 24
-; LE-32BIT-NEXT:    slw 24, 28, 16
-; LE-32BIT-NEXT:    srw 27, 27, 5
-; LE-32BIT-NEXT:    or 27, 24, 27
-; LE-32BIT-NEXT:    slw 24, 8, 15
-; LE-32BIT-NEXT:    srw 26, 7, 3
-; LE-32BIT-NEXT:    or 26, 24, 26
-; LE-32BIT-NEXT:    subfic 24, 30, 192
-; LE-32BIT-NEXT:    mr 9, 10
-; LE-32BIT-NEXT:    mr 10, 28
-; LE-32BIT-NEXT:    subfic 28, 24, 32
-; LE-32BIT-NEXT:    srw 28, 11, 28
-; LE-32BIT-NEXT:    slw 19, 6, 24
-; LE-32BIT-NEXT:    or 28, 19, 28
-; LE-32BIT-NEXT:    srw 19, 6, 25
-; LE-32BIT-NEXT:    or 19, 14, 19
-; LE-32BIT-NEXT:    srw 14, 10, 31
-; LE-32BIT-NEXT:    lwz 4, 64(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 18, 18, 14
-; LE-32BIT-NEXT:    srw 3, 11, 3
-; LE-32BIT-NEXT:    slw 14, 6, 15
-; LE-32BIT-NEXT:    cmplwi 5, 30, 64
-; LE-32BIT-NEXT:    cmplwi 1, 30, 128
-; LE-32BIT-NEXT:    slw 24, 11, 24
-; LE-32BIT-NEXT:    mr 21, 8
-; LE-32BIT-NEXT:    or 8, 14, 3
-; LE-32BIT-NEXT:    srw 14, 6, 31
-; LE-32BIT-NEXT:    crnand 21, 4, 20
-; LE-32BIT-NEXT:    srw 31, 6, 30
-; LE-32BIT-NEXT:    or 24, 0, 24
-; LE-32BIT-NEXT:    slw 0, 7, 15
-; LE-32BIT-NEXT:    mr 23, 7
-; LE-32BIT-NEXT:    or 17, 17, 14
-; LE-32BIT-NEXT:    bc 12, 21, .LBB9_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 14, 31, 0
-; LE-32BIT-NEXT:    b .LBB9_3
-; LE-32BIT-NEXT:  .LBB9_2:
-; LE-32BIT-NEXT:    li 14, 0
-; LE-32BIT-NEXT:  .LBB9_3:
-; LE-32BIT-NEXT:    or 20, 20, 0
-; LE-32BIT-NEXT:    subfic 0, 16, 64
-; LE-32BIT-NEXT:    lwz 7, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 31, 29, 0
-; LE-32BIT-NEXT:    stw 14, 0(4)
-; LE-32BIT-NEXT:    subfic 14, 0, 32
-; LE-32BIT-NEXT:    slw 14, 10, 14
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    slw 31, 29, 9
-; LE-32BIT-NEXT:    lwz 3, 36(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 7, 7, 31
-; LE-32BIT-NEXT:    slw 31, 11, 12
-; LE-32BIT-NEXT:    stw 7, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 7, 22, 31
-; LE-32BIT-NEXT:    slw 31, 29, 12
-; LE-32BIT-NEXT:    or 27, 27, 31
-; LE-32BIT-NEXT:    slw 31, 23, 9
-; LE-32BIT-NEXT:    or 26, 26, 31
-; LE-32BIT-NEXT:    slw 31, 11, 3
-; LE-32BIT-NEXT:    or 28, 28, 31
-; LE-32BIT-NEXT:    slw 31, 11, 15
-; LE-32BIT-NEXT:    lwz 22, 28(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    lwz 31, 40(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 0, 10, 0
-; LE-32BIT-NEXT:    or 7, 7, 0
-; LE-32BIT-NEXT:    srw 0, 22, 30
-; LE-32BIT-NEXT:    slw 25, 11, 9
-; LE-32BIT-NEXT:    or 26, 0, 26
-; LE-32BIT-NEXT:    srw 0, 10, 31
-; LE-32BIT-NEXT:    or 3, 8, 25
-; LE-32BIT-NEXT:    or 28, 0, 28
-; LE-32BIT-NEXT:    srw 0, 10, 30
-; LE-32BIT-NEXT:    srw 5, 10, 5
-; LE-32BIT-NEXT:    or 3, 0, 3
-; LE-32BIT-NEXT:    bc 12, 21, .LBB9_5
-; LE-32BIT-NEXT:  # %bb.4:
-; LE-32BIT-NEXT:    ori 0, 17, 0
-; LE-32BIT-NEXT:    b .LBB9_6
-; LE-32BIT-NEXT:  .LBB9_5:
-; LE-32BIT-NEXT:    li 0, 0
-; LE-32BIT-NEXT:  .LBB9_6:
-; LE-32BIT-NEXT:    lwz 8, 32(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 5, 14, 5
-; LE-32BIT-NEXT:    mr 14, 4
-; LE-32BIT-NEXT:    stw 0, 4(4)
-; LE-32BIT-NEXT:    slw 0, 11, 16
-; LE-32BIT-NEXT:    lwz 4, 52(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    cmplwi 6, 31, 64
-; LE-32BIT-NEXT:    mr 9, 21
-; LE-32BIT-NEXT:    or 5, 0, 5
-; LE-32BIT-NEXT:    lwz 0, 56(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_8
-; LE-32BIT-NEXT:  # %bb.7:
-; LE-32BIT-NEXT:    ori 25, 8, 0
-; LE-32BIT-NEXT:    b .LBB9_9
-; LE-32BIT-NEXT:  .LBB9_8:
-; LE-32BIT-NEXT:    addi 25, 24, 0
-; LE-32BIT-NEXT:  .LBB9_9:
-; LE-32BIT-NEXT:    bc 12, 20, .LBB9_11
-; LE-32BIT-NEXT:  # %bb.10:
-; LE-32BIT-NEXT:    ori 24, 19, 0
-; LE-32BIT-NEXT:    b .LBB9_12
-; LE-32BIT-NEXT:  .LBB9_11:
-; LE-32BIT-NEXT:    addi 24, 18, 0
-; LE-32BIT-NEXT:  .LBB9_12:
-; LE-32BIT-NEXT:    srw 19, 9, 4
-; LE-32BIT-NEXT:    srw 17, 6, 4
-; LE-32BIT-NEXT:    lwz 4, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 30, 21, 30
-; LE-32BIT-NEXT:    lwz 8, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 21, 29, 16
-; LE-32BIT-NEXT:    cmplwi 2, 16, 64
-; LE-32BIT-NEXT:    cmplwi 3, 16, 0
-; LE-32BIT-NEXT:    li 16, 0
-; LE-32BIT-NEXT:    srw 18, 6, 0
-; LE-32BIT-NEXT:    bc 12, 8, .LBB9_14
-; LE-32BIT-NEXT:  # %bb.13:
-; LE-32BIT-NEXT:    ori 0, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_15
-; LE-32BIT-NEXT:  .LBB9_14:
-; LE-32BIT-NEXT:    addi 0, 21, 0
-; LE-32BIT-NEXT:  .LBB9_15:
-; LE-32BIT-NEXT:    lwz 21, 60(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 8, .LBB9_16
-; LE-32BIT-NEXT:    b .LBB9_17
-; LE-32BIT-NEXT:  .LBB9_16:
-; LE-32BIT-NEXT:    addi 4, 7, 0
-; LE-32BIT-NEXT:  .LBB9_17:
-; LE-32BIT-NEXT:    bc 12, 20, .LBB9_18
-; LE-32BIT-NEXT:    b .LBB9_19
-; LE-32BIT-NEXT:  .LBB9_18:
-; LE-32BIT-NEXT:    addi 8, 20, 0
-; LE-32BIT-NEXT:  .LBB9_19:
-; LE-32BIT-NEXT:    mr 12, 29
-; LE-32BIT-NEXT:    lwz 7, 48(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 20, 29, 15
-; LE-32BIT-NEXT:    srw 29, 6, 31
-; LE-32BIT-NEXT:    bc 12, 2, .LBB9_20
-; LE-32BIT-NEXT:    b .LBB9_21
-; LE-32BIT-NEXT:  .LBB9_20:
-; LE-32BIT-NEXT:    addi 8, 21, 0
-; LE-32BIT-NEXT:  .LBB9_21:
-; LE-32BIT-NEXT:    cmplwi 7, 31, 0
-; LE-32BIT-NEXT:    bc 12, 20, .LBB9_23
-; LE-32BIT-NEXT:  # %bb.22:
-; LE-32BIT-NEXT:    ori 26, 19, 0
-; LE-32BIT-NEXT:    ori 3, 17, 0
-; LE-32BIT-NEXT:    b .LBB9_23
-; LE-32BIT-NEXT:  .LBB9_23:
-; LE-32BIT-NEXT:    or 8, 8, 0
-; LE-32BIT-NEXT:    bc 12, 20, .LBB9_25
-; LE-32BIT-NEXT:  # %bb.24:
-; LE-32BIT-NEXT:    ori 0, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_26
-; LE-32BIT-NEXT:  .LBB9_25:
-; LE-32BIT-NEXT:    addi 0, 30, 0
-; LE-32BIT-NEXT:  .LBB9_26:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_28
-; LE-32BIT-NEXT:  # %bb.27:
-; LE-32BIT-NEXT:    ori 30, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_29
-; LE-32BIT-NEXT:  .LBB9_28:
-; LE-32BIT-NEXT:    addi 30, 29, 0
-; LE-32BIT-NEXT:  .LBB9_29:
-; LE-32BIT-NEXT:    bc 12, 20, .LBB9_31
-; LE-32BIT-NEXT:  # %bb.30:
-; LE-32BIT-NEXT:    ori 29, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_32
-; LE-32BIT-NEXT:  .LBB9_31:
-; LE-32BIT-NEXT:    addi 29, 7, 0
-; LE-32BIT-NEXT:  .LBB9_32:
-; LE-32BIT-NEXT:    lwz 7, 44(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 30, .LBB9_33
-; LE-32BIT-NEXT:    b .LBB9_34
-; LE-32BIT-NEXT:  .LBB9_33:
-; LE-32BIT-NEXT:    addi 25, 12, 0
-; LE-32BIT-NEXT:  .LBB9_34:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB9_35
-; LE-32BIT-NEXT:    b .LBB9_36
-; LE-32BIT-NEXT:  .LBB9_35:
-; LE-32BIT-NEXT:    addi 4, 6, 0
-; LE-32BIT-NEXT:  .LBB9_36:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB9_38
-; LE-32BIT-NEXT:  # %bb.37:
-; LE-32BIT-NEXT:    ori 6, 26, 0
-; LE-32BIT-NEXT:    b .LBB9_39
-; LE-32BIT-NEXT:  .LBB9_38:
-; LE-32BIT-NEXT:    addi 6, 22, 0
-; LE-32BIT-NEXT:  .LBB9_39:
-; LE-32BIT-NEXT:    li 26, 0
-; LE-32BIT-NEXT:    bc 12, 2, .LBB9_40
-; LE-32BIT-NEXT:    b .LBB9_41
-; LE-32BIT-NEXT:  .LBB9_40:
-; LE-32BIT-NEXT:    addi 3, 10, 0
-; LE-32BIT-NEXT:  .LBB9_41:
-; LE-32BIT-NEXT:    bc 12, 8, .LBB9_43
-; LE-32BIT-NEXT:  # %bb.42:
-; LE-32BIT-NEXT:    ori 5, 20, 0
-; LE-32BIT-NEXT:    b .LBB9_43
-; LE-32BIT-NEXT:  .LBB9_43:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_45
-; LE-32BIT-NEXT:  # %bb.44:
-; LE-32BIT-NEXT:    ori 8, 25, 0
-; LE-32BIT-NEXT:    b .LBB9_45
-; LE-32BIT-NEXT:  .LBB9_45:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_47
-; LE-32BIT-NEXT:  # %bb.46:
-; LE-32BIT-NEXT:    ori 28, 18, 0
-; LE-32BIT-NEXT:    b .LBB9_47
-; LE-32BIT-NEXT:  .LBB9_47:
-; LE-32BIT-NEXT:    bc 12, 8, .LBB9_49
-; LE-32BIT-NEXT:  # %bb.48:
-; LE-32BIT-NEXT:    ori 27, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_49
-; LE-32BIT-NEXT:  .LBB9_49:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB9_51
-; LE-32BIT-NEXT:  # %bb.50:
-; LE-32BIT-NEXT:    ori 12, 24, 0
-; LE-32BIT-NEXT:    b .LBB9_51
-; LE-32BIT-NEXT:  .LBB9_51:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_53
-; LE-32BIT-NEXT:  # %bb.52:
-; LE-32BIT-NEXT:    ori 3, 26, 0
-; LE-32BIT-NEXT:    b .LBB9_53
-; LE-32BIT-NEXT:  .LBB9_53:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB9_54
-; LE-32BIT-NEXT:    b .LBB9_55
-; LE-32BIT-NEXT:  .LBB9_54:
-; LE-32BIT-NEXT:    addi 5, 11, 0
-; LE-32BIT-NEXT:  .LBB9_55:
-; LE-32BIT-NEXT:    bc 12, 30, .LBB9_56
-; LE-32BIT-NEXT:    b .LBB9_57
-; LE-32BIT-NEXT:  .LBB9_56:
-; LE-32BIT-NEXT:    addi 28, 10, 0
-; LE-32BIT-NEXT:  .LBB9_57:
-; LE-32BIT-NEXT:    or 6, 6, 27
-; LE-32BIT-NEXT:    stw 3, 8(14)
-; LE-32BIT-NEXT:    or 3, 0, 4
-; LE-32BIT-NEXT:    bc 12, 2, .LBB9_59
-; LE-32BIT-NEXT:  # %bb.58:
-; LE-32BIT-NEXT:    ori 4, 8, 0
-; LE-32BIT-NEXT:    b .LBB9_60
-; LE-32BIT-NEXT:  .LBB9_59:
-; LE-32BIT-NEXT:    addi 4, 21, 0
-; LE-32BIT-NEXT:  .LBB9_60:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_62
-; LE-32BIT-NEXT:  # %bb.61:
-; LE-32BIT-NEXT:    ori 24, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_63
-; LE-32BIT-NEXT:  .LBB9_62:
-; LE-32BIT-NEXT:    addi 24, 7, 0
-; LE-32BIT-NEXT:  .LBB9_63:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_65
-; LE-32BIT-NEXT:  # %bb.64:
-; LE-32BIT-NEXT:    ori 3, 30, 0
-; LE-32BIT-NEXT:    ori 6, 28, 0
-; LE-32BIT-NEXT:    ori 12, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_65
-; LE-32BIT-NEXT:  .LBB9_65:
-; LE-32BIT-NEXT:    stw 4, 28(14)
-; LE-32BIT-NEXT:    or 4, 29, 5
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_67
-; LE-32BIT-NEXT:  # %bb.66:
-; LE-32BIT-NEXT:    ori 4, 24, 0
-; LE-32BIT-NEXT:    b .LBB9_67
-; LE-32BIT-NEXT:  .LBB9_67:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB9_69
-; LE-32BIT-NEXT:  # %bb.68:
-; LE-32BIT-NEXT:    ori 5, 6, 0
-; LE-32BIT-NEXT:    b .LBB9_70
-; LE-32BIT-NEXT:  .LBB9_69:
-; LE-32BIT-NEXT:    addi 3, 9, 0
-; LE-32BIT-NEXT:    addi 5, 22, 0
-; LE-32BIT-NEXT:  .LBB9_70:
-; LE-32BIT-NEXT:    stw 12, 12(14)
-; LE-32BIT-NEXT:    stw 3, 16(14)
-; LE-32BIT-NEXT:    bc 12, 2, .LBB9_72
-; LE-32BIT-NEXT:  # %bb.71:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB9_73
-; LE-32BIT-NEXT:  .LBB9_72:
-; LE-32BIT-NEXT:    addi 3, 23, 0
-; LE-32BIT-NEXT:  .LBB9_73:
-; LE-32BIT-NEXT:    stw 5, 24(14)
-; LE-32BIT-NEXT:    stw 3, 20(14)
-; LE-32BIT-NEXT:    lwz 12, 68(1)
-; LE-32BIT-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
-; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
-; LE-32BIT-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 144
+; LE-32BIT-NEXT:    lwz 7, 12(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 20(3)
+; LE-32BIT-NEXT:    lwz 10, 16(3)
+; LE-32BIT-NEXT:    lwz 11, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    stw 10, 16(5)
+; LE-32BIT-NEXT:    stw 9, 20(5)
+; LE-32BIT-NEXT:    stw 8, 8(5)
+; LE-32BIT-NEXT:    stw 7, 12(5)
+; LE-32BIT-NEXT:    stw 6, 0(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 80
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -1189,582 +529,105 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: shl_32bytes:
 ; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    li 6, 16
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
-; LE-64BIT-NEXT:    ld 7, 24(3)
-; LE-64BIT-NEXT:    ld 8, 16(3)
-; LE-64BIT-NEXT:    ld 9, 8(3)
-; LE-64BIT-NEXT:    li 6, 0
-; LE-64BIT-NEXT:    ld 3, 0(3)
-; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 21, -88(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 24, -64(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    rlwinm. 4, 4, 3, 0, 28
-; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    subfic 28, 4, 64
-; LE-64BIT-NEXT:    subfic 11, 4, 192
-; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    addi 0, 4, -128
-; LE-64BIT-NEXT:    sld 29, 9, 4
-; LE-64BIT-NEXT:    addi 27, 4, -64
-; LE-64BIT-NEXT:    subfic 25, 4, 128
-; LE-64BIT-NEXT:    srd 24, 8, 28
-; LE-64BIT-NEXT:    std 22, -80(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    srd 21, 9, 28
-; LE-64BIT-NEXT:    srd 28, 3, 28
-; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    sld 10, 7, 4
-; LE-64BIT-NEXT:    addi 30, 4, -192
-; LE-64BIT-NEXT:    subfic 22, 25, 64
-; LE-64BIT-NEXT:    srd 11, 3, 11
-; LE-64BIT-NEXT:    sld 26, 9, 0
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    std 23, -72(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    or 10, 10, 24
-; LE-64BIT-NEXT:    sld 28, 3, 27
-; LE-64BIT-NEXT:    sld 30, 3, 30
-; LE-64BIT-NEXT:    or 11, 26, 11
-; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 23, 8, 27
-; LE-64BIT-NEXT:    sld 27, 9, 22
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    or 11, 11, 30
-; LE-64BIT-NEXT:    ld 24, -64(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 28, 3, 25
-; LE-64BIT-NEXT:    or 10, 10, 23
-; LE-64BIT-NEXT:    ld 23, -72(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 22, -80(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 9, 9, 25
-; LE-64BIT-NEXT:    or 30, 28, 27
-; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    cmplwi 1, 4, 128
-; LE-64BIT-NEXT:    sld 12, 8, 4
-; LE-64BIT-NEXT:    or 9, 10, 9
-; LE-64BIT-NEXT:    or 30, 30, 21
-; LE-64BIT-NEXT:    ld 21, -88(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 10, 3, 0
-; LE-64BIT-NEXT:    isel 9, 9, 11, 4
-; LE-64BIT-NEXT:    or 11, 12, 30
-; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    iseleq 7, 7, 9
-; LE-64BIT-NEXT:    sld 3, 3, 4
-; LE-64BIT-NEXT:    isel 9, 11, 10, 4
-; LE-64BIT-NEXT:    std 7, 24(5)
-; LE-64BIT-NEXT:    isel 0, 29, 6, 4
-; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    iseleq 4, 8, 9
-; LE-64BIT-NEXT:    std 0, 8(5)
-; LE-64BIT-NEXT:    isel 3, 3, 6, 4
-; LE-64BIT-NEXT:    std 4, 16(5)
-; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    xxlxor 1, 1, 1
+; LE-64BIT-NEXT:    lxvd2x 2, 0, 3
+; LE-64BIT-NEXT:    li 7, 48
+; LE-64BIT-NEXT:    addi 8, 1, -32
+; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
+; LE-64BIT-NEXT:    addi 3, 1, -64
+; LE-64BIT-NEXT:    clrlwi 4, 4, 27
+; LE-64BIT-NEXT:    stxvd2x 1, 3, 6
+; LE-64BIT-NEXT:    neg 4, 4
+; LE-64BIT-NEXT:    stxvd2x 0, 3, 7
+; LE-64BIT-NEXT:    li 7, 32
+; LE-64BIT-NEXT:    extsw 4, 4
+; LE-64BIT-NEXT:    stxvd2x 2, 3, 7
+; LE-64BIT-NEXT:    stxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    add 3, 8, 4
+; LE-64BIT-NEXT:    lxvd2x 0, 8, 4
+; LE-64BIT-NEXT:    lxvd2x 1, 3, 6
+; LE-64BIT-NEXT:    stxvd2x 1, 5, 6
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: shl_32bytes:
 ; BE:       # %bb.0:
-; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    ld 6, 0(3)
 ; BE-NEXT:    ld 7, 8(3)
-; BE-NEXT:    ld 8, 0(3)
-; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
-; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; BE-NEXT:    li 6, 0
-; BE-NEXT:    rlwinm. 4, 4, 3, 0, 28
-; BE-NEXT:    subfic 10, 4, 192
-; BE-NEXT:    addi 11, 4, -128
-; BE-NEXT:    addi 12, 4, -192
-; BE-NEXT:    subfic 30, 4, 64
-; BE-NEXT:    srd 10, 3, 10
-; BE-NEXT:    sld 27, 9, 11
-; BE-NEXT:    sld 0, 8, 4
-; BE-NEXT:    addi 29, 4, -64
-; BE-NEXT:    subfic 28, 4, 128
-; BE-NEXT:    sld 12, 3, 12
-; BE-NEXT:    or 10, 27, 10
-; BE-NEXT:    srd 27, 7, 30
-; BE-NEXT:    or 10, 10, 12
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    sld 27, 7, 29
-; BE-NEXT:    subfic 12, 28, 64
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    srd 27, 3, 28
-; BE-NEXT:    sld 12, 9, 12
-; BE-NEXT:    srd 28, 9, 28
-; BE-NEXT:    cmplwi 1, 4, 128
-; BE-NEXT:    or 12, 27, 12
-; BE-NEXT:    or 28, 0, 28
-; BE-NEXT:    srd 0, 9, 30
-; BE-NEXT:    sld 9, 9, 4
-; BE-NEXT:    sld 11, 3, 11
-; BE-NEXT:    bc 12, 4, .LBB10_1
-; BE-NEXT:    b .LBB10_2
-; BE-NEXT:  .LBB10_1:
-; BE-NEXT:    addi 10, 28, 0
-; BE-NEXT:  .LBB10_2:
-; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; BE-NEXT:    or 12, 12, 0
-; BE-NEXT:    sld 0, 7, 4
-; BE-NEXT:    or 12, 0, 12
-; BE-NEXT:    srd 0, 3, 30
-; BE-NEXT:    sld 30, 3, 29
-; BE-NEXT:    bc 12, 4, .LBB10_3
-; BE-NEXT:    b .LBB10_4
-; BE-NEXT:  .LBB10_3:
-; BE-NEXT:    addi 11, 12, 0
-; BE-NEXT:  .LBB10_4:
-; BE-NEXT:    sld 3, 3, 4
-; BE-NEXT:    bc 12, 2, .LBB10_6
-; BE-NEXT:  # %bb.5:
-; BE-NEXT:    ori 4, 10, 0
-; BE-NEXT:    b .LBB10_7
-; BE-NEXT:  .LBB10_6:
-; BE-NEXT:    addi 4, 8, 0
-; BE-NEXT:  .LBB10_7:
-; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; BE-NEXT:    or 9, 9, 0
-; BE-NEXT:    or 9, 9, 30
-; BE-NEXT:    bc 12, 2, .LBB10_9
-; BE-NEXT:  # %bb.8:
-; BE-NEXT:    ori 7, 11, 0
-; BE-NEXT:    b .LBB10_9
-; BE-NEXT:  .LBB10_9:
-; BE-NEXT:    bc 12, 4, .LBB10_11
-; BE-NEXT:  # %bb.10:
-; BE-NEXT:    ori 8, 6, 0
-; BE-NEXT:    ori 3, 6, 0
-; BE-NEXT:    b .LBB10_12
-; BE-NEXT:  .LBB10_11:
-; BE-NEXT:    addi 8, 9, 0
-; BE-NEXT:  .LBB10_12:
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    addi 9, 1, -64
+; BE-NEXT:    li 10, 0
+; BE-NEXT:    std 10, 56(9)
+; BE-NEXT:    std 10, 48(9)
+; BE-NEXT:    std 10, 40(9)
+; BE-NEXT:    std 10, 32(9)
+; BE-NEXT:    std 3, 24(9)
+; BE-NEXT:    std 8, 16(9)
+; BE-NEXT:    std 7, 8(9)
+; BE-NEXT:    std 6, -64(1)
+; BE-NEXT:    clrldi 3, 4, 59
+; BE-NEXT:    ldux 4, 3, 9
+; BE-NEXT:    ld 6, 8(3)
+; BE-NEXT:    ld 7, 24(3)
+; BE-NEXT:    ld 3, 16(3)
 ; BE-NEXT:    std 4, 0(5)
-; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; BE-NEXT:    std 3, 24(5)
-; BE-NEXT:    std 8, 16(5)
-; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    std 3, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 6, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: shl_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -144(1)
-; LE-32BIT-NEXT:    mfcr 12
-; LE-32BIT-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 12, 68(1)
-; LE-32BIT-NEXT:    lwz 0, 28(4)
-; LE-32BIT-NEXT:    stw 5, 64(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    lwz 6, 24(3)
-; LE-32BIT-NEXT:    rlwinm. 30, 0, 3, 0, 28
-; LE-32BIT-NEXT:    lwz 5, 28(3)
-; LE-32BIT-NEXT:    subfic 21, 30, 224
-; LE-32BIT-NEXT:    lwz 7, 4(3)
-; LE-32BIT-NEXT:    subfic 0, 30, 160
-; LE-32BIT-NEXT:    lwz 9, 0(3)
-; LE-32BIT-NEXT:    addi 4, 30, -128
-; LE-32BIT-NEXT:    lwz 10, 8(3)
-; LE-32BIT-NEXT:    subfic 28, 30, 96
-; LE-32BIT-NEXT:    lwz 8, 12(3)
-; LE-32BIT-NEXT:    addi 29, 30, -64
-; LE-32BIT-NEXT:    lwz 12, 16(3)
-; LE-32BIT-NEXT:    subfic 25, 30, 32
-; LE-32BIT-NEXT:    lwz 11, 20(3)
-; LE-32BIT-NEXT:    addi 3, 30, -192
-; LE-32BIT-NEXT:    srw 21, 5, 21
-; LE-32BIT-NEXT:    slw 16, 6, 3
-; LE-32BIT-NEXT:    stw 3, 56(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 20, 9, 30
-; LE-32BIT-NEXT:    srw 15, 11, 0
-; LE-32BIT-NEXT:    slw 14, 12, 4
-; LE-32BIT-NEXT:    srw 31, 8, 28
-; LE-32BIT-NEXT:    slw 3, 10, 29
-; LE-32BIT-NEXT:    or 21, 16, 21
-; LE-32BIT-NEXT:    srw 16, 7, 25
-; LE-32BIT-NEXT:    slw 19, 10, 30
-; LE-32BIT-NEXT:    or 15, 14, 15
-; LE-32BIT-NEXT:    srw 14, 8, 25
-; LE-32BIT-NEXT:    or 3, 3, 31
-; LE-32BIT-NEXT:    srw 31, 5, 0
-; LE-32BIT-NEXT:    or 20, 20, 16
-; LE-32BIT-NEXT:    slw 16, 6, 4
-; LE-32BIT-NEXT:    addi 27, 30, -224
-; LE-32BIT-NEXT:    or 19, 19, 14
-; LE-32BIT-NEXT:    srw 14, 5, 28
-; LE-32BIT-NEXT:    or 16, 16, 31
-; LE-32BIT-NEXT:    slw 31, 6, 29
-; LE-32BIT-NEXT:    addi 23, 30, -160
-; LE-32BIT-NEXT:    slw 18, 12, 30
-; LE-32BIT-NEXT:    stw 0, 40(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    srw 31, 11, 25
-; LE-32BIT-NEXT:    slw 0, 5, 27
-; LE-32BIT-NEXT:    addi 26, 30, -96
-; LE-32BIT-NEXT:    slw 17, 6, 30
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    srw 31, 5, 25
-; LE-32BIT-NEXT:    or 21, 21, 0
-; LE-32BIT-NEXT:    slw 0, 11, 23
-; LE-32BIT-NEXT:    or 17, 17, 31
-; LE-32BIT-NEXT:    addi 31, 30, -32
-; LE-32BIT-NEXT:    or 0, 15, 0
-; LE-32BIT-NEXT:    slw 15, 8, 26
-; LE-32BIT-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 29, 3, 15
-; LE-32BIT-NEXT:    slw 15, 7, 31
-; LE-32BIT-NEXT:    or 20, 20, 15
-; LE-32BIT-NEXT:    slw 15, 8, 31
-; LE-32BIT-NEXT:    or 3, 19, 15
-; LE-32BIT-NEXT:    subfic 15, 30, 128
-; LE-32BIT-NEXT:    slw 23, 5, 23
-; LE-32BIT-NEXT:    stw 3, 48(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 3, 16, 23
-; LE-32BIT-NEXT:    subfic 16, 15, 32
-; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 3, 11, 15
-; LE-32BIT-NEXT:    slw 22, 12, 16
-; LE-32BIT-NEXT:    or 23, 3, 22
-; LE-32BIT-NEXT:    subfic 22, 30, 64
-; LE-32BIT-NEXT:    stw 9, 60(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 9, 10
-; LE-32BIT-NEXT:    subfic 3, 22, 32
-; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 4, 8, 22
-; LE-32BIT-NEXT:    slw 24, 9, 3
-; LE-32BIT-NEXT:    or 4, 4, 24
-; LE-32BIT-NEXT:    subfic 24, 30, 192
-; LE-32BIT-NEXT:    subfic 27, 24, 32
-; LE-32BIT-NEXT:    mr 10, 26
-; LE-32BIT-NEXT:    slw 27, 6, 27
-; LE-32BIT-NEXT:    srw 26, 5, 24
-; LE-32BIT-NEXT:    stw 28, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 27, 26, 27
-; LE-32BIT-NEXT:    srw 26, 11, 22
-; LE-32BIT-NEXT:    slw 28, 12, 3
-; LE-32BIT-NEXT:    or 28, 26, 28
-; LE-32BIT-NEXT:    srw 26, 5, 15
-; LE-32BIT-NEXT:    slw 19, 6, 16
-; LE-32BIT-NEXT:    or 26, 26, 19
-; LE-32BIT-NEXT:    slw 19, 5, 10
-; LE-32BIT-NEXT:    stw 7, 32(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 7, 9
-; LE-32BIT-NEXT:    or 19, 14, 19
-; LE-32BIT-NEXT:    slw 14, 11, 31
-; LE-32BIT-NEXT:    lwz 9, 64(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 18, 18, 14
-; LE-32BIT-NEXT:    slw 3, 6, 3
-; LE-32BIT-NEXT:    srw 14, 5, 22
-; LE-32BIT-NEXT:    cmplwi 5, 30, 64
-; LE-32BIT-NEXT:    cmplwi 1, 30, 128
-; LE-32BIT-NEXT:    srw 24, 6, 24
-; LE-32BIT-NEXT:    or 10, 14, 3
-; LE-32BIT-NEXT:    slw 14, 5, 31
-; LE-32BIT-NEXT:    crnand 21, 4, 20
-; LE-32BIT-NEXT:    slw 31, 5, 30
-; LE-32BIT-NEXT:    or 24, 0, 24
-; LE-32BIT-NEXT:    mr 3, 7
-; LE-32BIT-NEXT:    stw 7, 28(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 0, 7, 22
-; LE-32BIT-NEXT:    lwz 7, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 17, 17, 14
-; LE-32BIT-NEXT:    bc 12, 21, .LBB10_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 14, 31, 0
-; LE-32BIT-NEXT:    b .LBB10_3
-; LE-32BIT-NEXT:  .LBB10_2:
-; LE-32BIT-NEXT:    li 14, 0
-; LE-32BIT-NEXT:  .LBB10_3:
-; LE-32BIT-NEXT:    or 20, 20, 0
-; LE-32BIT-NEXT:    subfic 0, 15, 64
-; LE-32BIT-NEXT:    stw 14, 28(9)
-; LE-32BIT-NEXT:    subfic 14, 0, 32
-; LE-32BIT-NEXT:    srw 14, 11, 14
-; LE-32BIT-NEXT:    slw 31, 12, 0
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    srw 31, 12, 7
-; LE-32BIT-NEXT:    or 23, 23, 31
-; LE-32BIT-NEXT:    srw 31, 3, 25
-; LE-32BIT-NEXT:    lwz 3, 40(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 4, 4, 31
-; LE-32BIT-NEXT:    slw 0, 11, 0
-; LE-32BIT-NEXT:    cmplwi 2, 15, 64
-; LE-32BIT-NEXT:    srw 31, 6, 3
-; LE-32BIT-NEXT:    or 27, 27, 31
-; LE-32BIT-NEXT:    srw 31, 12, 25
-; LE-32BIT-NEXT:    or 28, 28, 31
-; LE-32BIT-NEXT:    srw 31, 6, 7
-; LE-32BIT-NEXT:    or 26, 26, 31
-; LE-32BIT-NEXT:    srw 31, 6, 22
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    lwz 31, 36(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 25, 6, 25
-; LE-32BIT-NEXT:    or 3, 10, 25
-; LE-32BIT-NEXT:    or 26, 26, 0
-; LE-32BIT-NEXT:    cmplwi 6, 31, 64
-; LE-32BIT-NEXT:    slw 0, 11, 30
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_5
-; LE-32BIT-NEXT:  # %bb.4:
-; LE-32BIT-NEXT:    ori 25, 21, 0
-; LE-32BIT-NEXT:    b .LBB10_6
-; LE-32BIT-NEXT:  .LBB10_5:
-; LE-32BIT-NEXT:    addi 25, 24, 0
-; LE-32BIT-NEXT:  .LBB10_6:
-; LE-32BIT-NEXT:    slw 24, 11, 16
-; LE-32BIT-NEXT:    lwz 10, 32(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 3, 0, 3
-; LE-32BIT-NEXT:    bc 12, 21, .LBB10_8
-; LE-32BIT-NEXT:  # %bb.7:
-; LE-32BIT-NEXT:    ori 0, 17, 0
-; LE-32BIT-NEXT:    b .LBB10_9
-; LE-32BIT-NEXT:  .LBB10_8:
-; LE-32BIT-NEXT:    li 0, 0
-; LE-32BIT-NEXT:  .LBB10_9:
-; LE-32BIT-NEXT:    or 24, 14, 24
-; LE-32BIT-NEXT:    stw 0, 24(9)
-; LE-32BIT-NEXT:    srw 0, 6, 15
-; LE-32BIT-NEXT:    or 24, 0, 24
-; LE-32BIT-NEXT:    lwz 0, 56(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 21, 10, 30
-; LE-32BIT-NEXT:    bc 12, 20, .LBB10_11
-; LE-32BIT-NEXT:  # %bb.10:
-; LE-32BIT-NEXT:    ori 7, 29, 0
-; LE-32BIT-NEXT:    b .LBB10_12
-; LE-32BIT-NEXT:  .LBB10_11:
-; LE-32BIT-NEXT:    addi 7, 20, 0
-; LE-32BIT-NEXT:  .LBB10_12:
-; LE-32BIT-NEXT:    or 4, 21, 4
-; LE-32BIT-NEXT:    slw 21, 11, 31
-; LE-32BIT-NEXT:    srw 20, 12, 15
-; LE-32BIT-NEXT:    cmplwi 3, 15, 0
-; LE-32BIT-NEXT:    li 15, 0
-; LE-32BIT-NEXT:    or 27, 21, 27
-; LE-32BIT-NEXT:    bc 12, 20, .LBB10_14
-; LE-32BIT-NEXT:  # %bb.13:
-; LE-32BIT-NEXT:    ori 21, 19, 0
-; LE-32BIT-NEXT:    b .LBB10_15
-; LE-32BIT-NEXT:  .LBB10_14:
-; LE-32BIT-NEXT:    addi 21, 18, 0
-; LE-32BIT-NEXT:  .LBB10_15:
-; LE-32BIT-NEXT:    mr 16, 9
-; LE-32BIT-NEXT:    lwz 9, 52(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 18, 5, 0
-; LE-32BIT-NEXT:    bc 12, 8, .LBB10_17
-; LE-32BIT-NEXT:  # %bb.16:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_18
-; LE-32BIT-NEXT:  .LBB10_17:
-; LE-32BIT-NEXT:    addi 0, 20, 0
-; LE-32BIT-NEXT:  .LBB10_18:
-; LE-32BIT-NEXT:    lwz 20, 60(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 30, 8, 30
-; LE-32BIT-NEXT:    slw 19, 8, 9
-; LE-32BIT-NEXT:    slw 17, 5, 9
-; LE-32BIT-NEXT:    bc 12, 2, .LBB10_20
-; LE-32BIT-NEXT:  # %bb.19:
-; LE-32BIT-NEXT:    ori 9, 7, 0
-; LE-32BIT-NEXT:    b .LBB10_21
-; LE-32BIT-NEXT:  .LBB10_20:
-; LE-32BIT-NEXT:    addi 9, 20, 0
-; LE-32BIT-NEXT:  .LBB10_21:
-; LE-32BIT-NEXT:    lwz 7, 48(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 29, 5, 31
-; LE-32BIT-NEXT:    or 9, 9, 0
-; LE-32BIT-NEXT:    bc 12, 20, .LBB10_23
-; LE-32BIT-NEXT:  # %bb.22:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_24
-; LE-32BIT-NEXT:  .LBB10_23:
-; LE-32BIT-NEXT:    addi 0, 30, 0
-; LE-32BIT-NEXT:  .LBB10_24:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_26
-; LE-32BIT-NEXT:  # %bb.25:
-; LE-32BIT-NEXT:    ori 30, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_27
-; LE-32BIT-NEXT:  .LBB10_26:
-; LE-32BIT-NEXT:    addi 30, 29, 0
-; LE-32BIT-NEXT:  .LBB10_27:
-; LE-32BIT-NEXT:    bc 12, 8, .LBB10_28
-; LE-32BIT-NEXT:    b .LBB10_29
-; LE-32BIT-NEXT:  .LBB10_28:
-; LE-32BIT-NEXT:    addi 28, 26, 0
-; LE-32BIT-NEXT:  .LBB10_29:
-; LE-32BIT-NEXT:    bc 12, 20, .LBB10_31
-; LE-32BIT-NEXT:  # %bb.30:
-; LE-32BIT-NEXT:    ori 3, 17, 0
-; LE-32BIT-NEXT:    b .LBB10_31
-; LE-32BIT-NEXT:  .LBB10_31:
-; LE-32BIT-NEXT:    srw 22, 12, 22
-; LE-32BIT-NEXT:    bc 12, 20, .LBB10_33
-; LE-32BIT-NEXT:  # %bb.32:
-; LE-32BIT-NEXT:    ori 29, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_34
-; LE-32BIT-NEXT:  .LBB10_33:
-; LE-32BIT-NEXT:    addi 29, 7, 0
-; LE-32BIT-NEXT:  .LBB10_34:
-; LE-32BIT-NEXT:    lwz 7, 44(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 20, .LBB10_36
-; LE-32BIT-NEXT:  # %bb.35:
-; LE-32BIT-NEXT:    ori 4, 19, 0
-; LE-32BIT-NEXT:    b .LBB10_36
-; LE-32BIT-NEXT:  .LBB10_36:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB10_38
-; LE-32BIT-NEXT:  # %bb.37:
-; LE-32BIT-NEXT:    ori 5, 28, 0
-; LE-32BIT-NEXT:    b .LBB10_38
-; LE-32BIT-NEXT:  .LBB10_38:
-; LE-32BIT-NEXT:    li 28, 0
-; LE-32BIT-NEXT:    bc 12, 2, .LBB10_39
-; LE-32BIT-NEXT:    b .LBB10_40
-; LE-32BIT-NEXT:  .LBB10_39:
-; LE-32BIT-NEXT:    addi 3, 11, 0
-; LE-32BIT-NEXT:  .LBB10_40:
-; LE-32BIT-NEXT:    cmplwi 7, 31, 0
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_42
-; LE-32BIT-NEXT:  # %bb.41:
-; LE-32BIT-NEXT:    ori 27, 18, 0
-; LE-32BIT-NEXT:    b .LBB10_42
-; LE-32BIT-NEXT:  .LBB10_42:
-; LE-32BIT-NEXT:    bc 12, 8, .LBB10_44
-; LE-32BIT-NEXT:  # %bb.43:
-; LE-32BIT-NEXT:    ori 26, 22, 0
-; LE-32BIT-NEXT:    b .LBB10_45
-; LE-32BIT-NEXT:  .LBB10_44:
-; LE-32BIT-NEXT:    addi 26, 24, 0
-; LE-32BIT-NEXT:  .LBB10_45:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB10_46
-; LE-32BIT-NEXT:    b .LBB10_47
-; LE-32BIT-NEXT:  .LBB10_46:
-; LE-32BIT-NEXT:    addi 4, 10, 0
-; LE-32BIT-NEXT:  .LBB10_47:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_49
-; LE-32BIT-NEXT:  # %bb.48:
-; LE-32BIT-NEXT:    ori 3, 28, 0
-; LE-32BIT-NEXT:    b .LBB10_49
-; LE-32BIT-NEXT:  .LBB10_49:
-; LE-32BIT-NEXT:    bc 12, 30, .LBB10_50
-; LE-32BIT-NEXT:    b .LBB10_51
-; LE-32BIT-NEXT:  .LBB10_50:
-; LE-32BIT-NEXT:    addi 25, 12, 0
-; LE-32BIT-NEXT:  .LBB10_51:
-; LE-32BIT-NEXT:    or 5, 0, 5
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_53
-; LE-32BIT-NEXT:  # %bb.52:
-; LE-32BIT-NEXT:    ori 24, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_54
-; LE-32BIT-NEXT:  .LBB10_53:
-; LE-32BIT-NEXT:    addi 24, 7, 0
-; LE-32BIT-NEXT:  .LBB10_54:
-; LE-32BIT-NEXT:    bc 12, 8, .LBB10_56
-; LE-32BIT-NEXT:  # %bb.55:
-; LE-32BIT-NEXT:    ori 7, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_57
-; LE-32BIT-NEXT:  .LBB10_56:
-; LE-32BIT-NEXT:    addi 7, 23, 0
-; LE-32BIT-NEXT:  .LBB10_57:
-; LE-32BIT-NEXT:    bc 12, 30, .LBB10_58
-; LE-32BIT-NEXT:    b .LBB10_59
-; LE-32BIT-NEXT:  .LBB10_58:
-; LE-32BIT-NEXT:    addi 27, 11, 0
-; LE-32BIT-NEXT:  .LBB10_59:
-; LE-32BIT-NEXT:    stw 3, 20(16)
-; LE-32BIT-NEXT:    or 3, 4, 7
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_61
-; LE-32BIT-NEXT:  # %bb.60:
-; LE-32BIT-NEXT:    ori 3, 27, 0
-; LE-32BIT-NEXT:    ori 9, 25, 0
-; LE-32BIT-NEXT:    b .LBB10_61
-; LE-32BIT-NEXT:  .LBB10_61:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB10_63
-; LE-32BIT-NEXT:  # %bb.62:
-; LE-32BIT-NEXT:    ori 6, 26, 0
-; LE-32BIT-NEXT:    b .LBB10_63
-; LE-32BIT-NEXT:  .LBB10_63:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB10_65
-; LE-32BIT-NEXT:  # %bb.64:
-; LE-32BIT-NEXT:    ori 12, 21, 0
-; LE-32BIT-NEXT:    b .LBB10_65
-; LE-32BIT-NEXT:  .LBB10_65:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_67
-; LE-32BIT-NEXT:  # %bb.66:
-; LE-32BIT-NEXT:    ori 5, 30, 0
-; LE-32BIT-NEXT:    b .LBB10_67
-; LE-32BIT-NEXT:  .LBB10_67:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB10_69
-; LE-32BIT-NEXT:  # %bb.68:
-; LE-32BIT-NEXT:    ori 4, 9, 0
-; LE-32BIT-NEXT:    b .LBB10_70
-; LE-32BIT-NEXT:  .LBB10_69:
-; LE-32BIT-NEXT:    addi 3, 10, 0
-; LE-32BIT-NEXT:    addi 4, 20, 0
-; LE-32BIT-NEXT:  .LBB10_70:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_72
-; LE-32BIT-NEXT:  # %bb.71:
-; LE-32BIT-NEXT:    ori 12, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_72
-; LE-32BIT-NEXT:  .LBB10_72:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB10_73
-; LE-32BIT-NEXT:    b .LBB10_74
-; LE-32BIT-NEXT:  .LBB10_73:
-; LE-32BIT-NEXT:    addi 5, 8, 0
-; LE-32BIT-NEXT:  .LBB10_74:
-; LE-32BIT-NEXT:    stw 3, 4(16)
-; LE-32BIT-NEXT:    lwz 3, 28(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    stw 4, 0(16)
-; LE-32BIT-NEXT:    or 4, 29, 6
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_76
-; LE-32BIT-NEXT:  # %bb.75:
-; LE-32BIT-NEXT:    ori 4, 24, 0
-; LE-32BIT-NEXT:    b .LBB10_76
-; LE-32BIT-NEXT:  .LBB10_76:
-; LE-32BIT-NEXT:    stw 12, 16(16)
-; LE-32BIT-NEXT:    bc 12, 2, .LBB10_78
-; LE-32BIT-NEXT:  # %bb.77:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB10_78
-; LE-32BIT-NEXT:  .LBB10_78:
-; LE-32BIT-NEXT:    stw 5, 12(16)
-; LE-32BIT-NEXT:    stw 3, 8(16)
-; LE-32BIT-NEXT:    lwz 12, 68(1)
-; LE-32BIT-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
-; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
-; LE-32BIT-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 144
+; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 6, 76(1)
+; LE-32BIT-NEXT:    stw 6, 72(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 6, 68(1)
+; LE-32BIT-NEXT:    stw 6, 64(1)
+; LE-32BIT-NEXT:    stw 6, 60(1)
+; LE-32BIT-NEXT:    stw 6, 56(1)
+; LE-32BIT-NEXT:    stw 6, 52(1)
+; LE-32BIT-NEXT:    stw 6, 48(1)
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 0, 40(1)
+; LE-32BIT-NEXT:    stw 12, 36(1)
+; LE-32BIT-NEXT:    stw 11, 32(1)
+; LE-32BIT-NEXT:    stw 10, 28(1)
+; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    stw 7, 16(1)
+; LE-32BIT-NEXT:    lwzux 3, 4, 3
+; LE-32BIT-NEXT:    lwz 6, 4(4)
+; LE-32BIT-NEXT:    lwz 7, 12(4)
+; LE-32BIT-NEXT:    lwz 8, 8(4)
+; LE-32BIT-NEXT:    lwz 9, 20(4)
+; LE-32BIT-NEXT:    lwz 10, 16(4)
+; LE-32BIT-NEXT:    lwz 11, 28(4)
+; LE-32BIT-NEXT:    lwz 4, 24(4)
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    stw 4, 24(5)
+; LE-32BIT-NEXT:    stw 11, 28(5)
+; LE-32BIT-NEXT:    stw 10, 16(5)
+; LE-32BIT-NEXT:    stw 9, 20(5)
+; LE-32BIT-NEXT:    stw 8, 8(5)
+; LE-32BIT-NEXT:    stw 7, 12(5)
+; LE-32BIT-NEXT:    stw 6, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 80
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -1776,603 +639,108 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: ashr_32bytes:
 ; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    ld 7, 16(3)
+; LE-64BIT-NEXT:    ld 8, 24(3)
+; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
-; LE-64BIT-NEXT:    ld 6, 24(3)
-; LE-64BIT-NEXT:    ld 8, 16(3)
-; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    rlwinm. 4, 4, 3, 0, 28
-; LE-64BIT-NEXT:    sradi 9, 6, 63
-; LE-64BIT-NEXT:    subfic 10, 4, 192
-; LE-64BIT-NEXT:    addi 11, 4, -128
-; LE-64BIT-NEXT:    addi 30, 4, -192
-; LE-64BIT-NEXT:    sld 10, 6, 10
-; LE-64BIT-NEXT:    srd 29, 8, 11
-; LE-64BIT-NEXT:    subfic 28, 4, 64
-; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    ld 7, 0(3)
-; LE-64BIT-NEXT:    ld 3, 8(3)
-; LE-64BIT-NEXT:    srd 0, 8, 4
-; LE-64BIT-NEXT:    srad 27, 6, 30
-; LE-64BIT-NEXT:    or 10, 29, 10
-; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    cmpwi 1, 30, 1
-; LE-64BIT-NEXT:    sld 26, 6, 28
-; LE-64BIT-NEXT:    addi 30, 4, -64
-; LE-64BIT-NEXT:    isel 10, 10, 27, 4
-; LE-64BIT-NEXT:    or 27, 0, 26
-; LE-64BIT-NEXT:    subfic 0, 4, 128
-; LE-64BIT-NEXT:    srd 12, 7, 4
-; LE-64BIT-NEXT:    sld 26, 3, 28
-; LE-64BIT-NEXT:    subfic 25, 0, 64
-; LE-64BIT-NEXT:    srad 29, 6, 30
-; LE-64BIT-NEXT:    cmpwi 1, 30, 1
-; LE-64BIT-NEXT:    or 12, 12, 26
-; LE-64BIT-NEXT:    srd 30, 3, 30
-; LE-64BIT-NEXT:    sld 28, 8, 28
-; LE-64BIT-NEXT:    srd 26, 8, 25
-; LE-64BIT-NEXT:    sld 8, 8, 0
-; LE-64BIT-NEXT:    or 12, 12, 30
-; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 0, 6, 0
-; LE-64BIT-NEXT:    isel 29, 27, 29, 4
-; LE-64BIT-NEXT:    or 8, 12, 8
-; LE-64BIT-NEXT:    or 0, 0, 26
-; LE-64BIT-NEXT:    cmplwi 1, 4, 128
-; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 27, 3, 4
-; LE-64BIT-NEXT:    or 0, 0, 28
-; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srad 11, 6, 11
-; LE-64BIT-NEXT:    isel 8, 8, 10, 4
-; LE-64BIT-NEXT:    or 10, 27, 0
-; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    iseleq 7, 7, 8
-; LE-64BIT-NEXT:    srad 4, 6, 4
-; LE-64BIT-NEXT:    isel 8, 10, 11, 4
-; LE-64BIT-NEXT:    std 7, 0(5)
-; LE-64BIT-NEXT:    isel 12, 29, 9, 4
-; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    iseleq 3, 3, 8
-; LE-64BIT-NEXT:    std 12, 16(5)
-; LE-64BIT-NEXT:    isel 4, 4, 9, 4
-; LE-64BIT-NEXT:    std 3, 8(5)
-; LE-64BIT-NEXT:    std 4, 24(5)
+; LE-64BIT-NEXT:    addi 6, 1, -64
+; LE-64BIT-NEXT:    sradi 3, 8, 63
+; LE-64BIT-NEXT:    clrldi 4, 4, 59
+; LE-64BIT-NEXT:    std 8, 24(6)
+; LE-64BIT-NEXT:    std 7, 16(6)
+; LE-64BIT-NEXT:    std 3, 56(6)
+; LE-64BIT-NEXT:    std 3, 48(6)
+; LE-64BIT-NEXT:    li 7, 16
+; LE-64BIT-NEXT:    std 3, 40(6)
+; LE-64BIT-NEXT:    std 3, 32(6)
+; LE-64BIT-NEXT:    add 3, 6, 4
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 6
+; LE-64BIT-NEXT:    lxvd2x 0, 6, 4
+; LE-64BIT-NEXT:    lxvd2x 1, 3, 7
+; LE-64BIT-NEXT:    stxvd2x 1, 5, 7
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: ashr_32bytes:
 ; BE:       # %bb.0:
-; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    ld 6, 16(3)
-; BE-NEXT:    ld 7, 24(3)
+; BE-NEXT:    ld 7, 0(3)
 ; BE-NEXT:    ld 8, 8(3)
-; BE-NEXT:    ld 3, 0(3)
-; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; BE-NEXT:    rlwinm. 4, 4, 3, 0, 28
-; BE-NEXT:    subfic 9, 4, 192
-; BE-NEXT:    addi 10, 4, -128
-; BE-NEXT:    addi 11, 4, -192
-; BE-NEXT:    subfic 0, 4, 64
-; BE-NEXT:    sld 9, 3, 9
-; BE-NEXT:    srd 27, 8, 10
-; BE-NEXT:    srd 12, 7, 4
-; BE-NEXT:    subfic 29, 4, 128
-; BE-NEXT:    cmpwi 1, 11, 1
-; BE-NEXT:    srad 11, 3, 11
-; BE-NEXT:    or 9, 27, 9
-; BE-NEXT:    sld 27, 6, 0
-; BE-NEXT:    addi 30, 4, -64
-; BE-NEXT:    srd 28, 8, 4
-; BE-NEXT:    or 12, 12, 27
-; BE-NEXT:    sld 27, 3, 0
-; BE-NEXT:    bc 12, 4, .LBB11_2
-; BE-NEXT:  # %bb.1:
-; BE-NEXT:    ori 9, 11, 0
-; BE-NEXT:    b .LBB11_2
-; BE-NEXT:  .LBB11_2:
-; BE-NEXT:    subfic 11, 29, 64
-; BE-NEXT:    or 28, 28, 27
-; BE-NEXT:    srd 27, 6, 30
-; BE-NEXT:    sld 0, 8, 0
-; BE-NEXT:    srd 11, 8, 11
-; BE-NEXT:    sld 8, 8, 29
-; BE-NEXT:    sld 29, 3, 29
-; BE-NEXT:    cmplwi 1, 4, 128
-; BE-NEXT:    or 12, 12, 27
-; BE-NEXT:    or 11, 29, 11
-; BE-NEXT:    or 8, 12, 8
-; BE-NEXT:    srd 12, 6, 4
-; BE-NEXT:    or 11, 11, 0
-; BE-NEXT:    srad 10, 3, 10
-; BE-NEXT:    srad 29, 3, 30
-; BE-NEXT:    or 11, 12, 11
-; BE-NEXT:    cmpwi 5, 30, 1
-; BE-NEXT:    bc 12, 20, .LBB11_4
-; BE-NEXT:  # %bb.3:
-; BE-NEXT:    ori 12, 29, 0
-; BE-NEXT:    b .LBB11_5
-; BE-NEXT:  .LBB11_4:
-; BE-NEXT:    addi 12, 28, 0
-; BE-NEXT:  .LBB11_5:
-; BE-NEXT:    bc 12, 4, .LBB11_7
-; BE-NEXT:  # %bb.6:
-; BE-NEXT:    ori 8, 9, 0
-; BE-NEXT:    ori 9, 10, 0
-; BE-NEXT:    b .LBB11_8
-; BE-NEXT:  .LBB11_7:
-; BE-NEXT:    addi 9, 11, 0
-; BE-NEXT:  .LBB11_8:
-; BE-NEXT:    sradi 10, 3, 63
-; BE-NEXT:    srad 3, 3, 4
-; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; BE-NEXT:    bc 12, 2, .LBB11_10
-; BE-NEXT:  # %bb.9:
-; BE-NEXT:    ori 4, 8, 0
-; BE-NEXT:    ori 6, 9, 0
-; BE-NEXT:    b .LBB11_11
-; BE-NEXT:  .LBB11_10:
-; BE-NEXT:    addi 4, 7, 0
-; BE-NEXT:  .LBB11_11:
-; BE-NEXT:    bc 12, 4, .LBB11_13
-; BE-NEXT:  # %bb.12:
-; BE-NEXT:    ori 7, 10, 0
-; BE-NEXT:    ori 3, 10, 0
-; BE-NEXT:    b .LBB11_14
-; BE-NEXT:  .LBB11_13:
-; BE-NEXT:    addi 7, 12, 0
-; BE-NEXT:  .LBB11_14:
+; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    addi 6, 1, -64
+; BE-NEXT:    std 3, 56(6)
+; BE-NEXT:    sradi 3, 7, 63
+; BE-NEXT:    clrlwi 4, 4, 27
+; BE-NEXT:    std 3, 24(6)
+; BE-NEXT:    std 3, 16(6)
+; BE-NEXT:    std 3, 8(6)
+; BE-NEXT:    std 3, -64(1)
+; BE-NEXT:    neg 3, 4
+; BE-NEXT:    std 9, 48(6)
+; BE-NEXT:    std 8, 40(6)
+; BE-NEXT:    std 7, 32(6)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    addi 4, 1, -32
+; BE-NEXT:    ldux 3, 4, 3
+; BE-NEXT:    ld 6, 8(4)
+; BE-NEXT:    ld 7, 24(4)
+; BE-NEXT:    ld 4, 16(4)
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 7, 8(5)
-; BE-NEXT:    std 4, 24(5)
-; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 4, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 6, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: ashr_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -144(1)
-; LE-32BIT-NEXT:    mfcr 12
-; LE-32BIT-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    addi 6, 1, 48
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    srawi 3, 7, 31
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 0, 72(1)
 ; LE-32BIT-NEXT:    stw 12, 68(1)
-; LE-32BIT-NEXT:    lwz 0, 28(4)
-; LE-32BIT-NEXT:    lwz 29, 4(3)
-; LE-32BIT-NEXT:    lwz 12, 0(3)
-; LE-32BIT-NEXT:    rlwinm. 30, 0, 3, 0, 28
-; LE-32BIT-NEXT:    stw 5, 64(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 23, 30, 224
-; LE-32BIT-NEXT:    lwz 5, 24(3)
-; LE-32BIT-NEXT:    addi 21, 30, -224
-; LE-32BIT-NEXT:    lwz 8, 28(3)
-; LE-32BIT-NEXT:    subfic 4, 30, 160
-; LE-32BIT-NEXT:    lwz 10, 20(3)
-; LE-32BIT-NEXT:    addi 11, 30, -128
-; LE-32BIT-NEXT:    lwz 9, 16(3)
-; LE-32BIT-NEXT:    subfic 25, 30, 96
-; LE-32BIT-NEXT:    lwz 26, 12(3)
-; LE-32BIT-NEXT:    addi 0, 30, -64
-; LE-32BIT-NEXT:    lwz 7, 8(3)
-; LE-32BIT-NEXT:    addi 3, 30, -192
-; LE-32BIT-NEXT:    subfic 27, 30, 32
-; LE-32BIT-NEXT:    slw 23, 12, 23
-; LE-32BIT-NEXT:    srw 16, 29, 3
-; LE-32BIT-NEXT:    stw 3, 56(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 20, 8, 30
-; LE-32BIT-NEXT:    sraw 15, 12, 21
-; LE-32BIT-NEXT:    cmpwi 1, 21, 1
-; LE-32BIT-NEXT:    slw 21, 7, 4
-; LE-32BIT-NEXT:    srw 14, 26, 11
-; LE-32BIT-NEXT:    slw 31, 9, 25
-; LE-32BIT-NEXT:    srw 3, 10, 0
-; LE-32BIT-NEXT:    or 23, 16, 23
-; LE-32BIT-NEXT:    slw 16, 5, 27
-; LE-32BIT-NEXT:    srw 19, 10, 30
-; LE-32BIT-NEXT:    or 21, 14, 21
-; LE-32BIT-NEXT:    slw 14, 9, 27
-; LE-32BIT-NEXT:    or 3, 3, 31
-; LE-32BIT-NEXT:    slw 31, 12, 4
-; LE-32BIT-NEXT:    or 20, 20, 16
-; LE-32BIT-NEXT:    srw 16, 29, 11
-; LE-32BIT-NEXT:    or 19, 19, 14
-; LE-32BIT-NEXT:    slw 14, 12, 25
-; LE-32BIT-NEXT:    or 16, 16, 31
-; LE-32BIT-NEXT:    srw 31, 29, 0
-; LE-32BIT-NEXT:    addi 24, 30, -160
-; LE-32BIT-NEXT:    srw 18, 26, 30
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    slw 31, 7, 27
-; LE-32BIT-NEXT:    addi 28, 30, -96
-; LE-32BIT-NEXT:    srw 17, 29, 30
-; LE-32BIT-NEXT:    stw 4, 32(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    slw 31, 12, 27
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 4, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_3
-; LE-32BIT-NEXT:  .LBB11_2:
-; LE-32BIT-NEXT:    addi 4, 23, 0
-; LE-32BIT-NEXT:  .LBB11_3:
-; LE-32BIT-NEXT:    srw 15, 7, 24
-; LE-32BIT-NEXT:    or 17, 17, 31
-; LE-32BIT-NEXT:    addi 31, 30, -32
-; LE-32BIT-NEXT:    or 21, 21, 15
-; LE-32BIT-NEXT:    srw 15, 9, 28
-; LE-32BIT-NEXT:    or 3, 3, 15
-; LE-32BIT-NEXT:    srw 15, 5, 31
-; LE-32BIT-NEXT:    or 20, 20, 15
-; LE-32BIT-NEXT:    srw 15, 9, 31
-; LE-32BIT-NEXT:    stw 3, 28(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 3, 19, 15
-; LE-32BIT-NEXT:    subfic 15, 30, 64
-; LE-32BIT-NEXT:    stw 4, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    cmpwi 1, 24, 1
-; LE-32BIT-NEXT:    sraw 24, 12, 24
-; LE-32BIT-NEXT:    subfic 4, 15, 32
-; LE-32BIT-NEXT:    stw 0, 52(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 0, 26, 4
-; LE-32BIT-NEXT:    stw 3, 48(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_5
-; LE-32BIT-NEXT:  # %bb.4:
-; LE-32BIT-NEXT:    ori 3, 24, 0
-; LE-32BIT-NEXT:    b .LBB11_6
-; LE-32BIT-NEXT:  .LBB11_5:
-; LE-32BIT-NEXT:    addi 3, 16, 0
-; LE-32BIT-NEXT:  .LBB11_6:
-; LE-32BIT-NEXT:    slw 16, 7, 15
-; LE-32BIT-NEXT:    or 0, 16, 0
-; LE-32BIT-NEXT:    subfic 16, 30, 128
-; LE-32BIT-NEXT:    stw 5, 36(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 5, 16, 32
-; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 3, 12, 16
-; LE-32BIT-NEXT:    srw 22, 29, 5
-; LE-32BIT-NEXT:    stw 8, 60(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 8, 10
-; LE-32BIT-NEXT:    mr 10, 27
-; LE-32BIT-NEXT:    or 23, 3, 22
-; LE-32BIT-NEXT:    slw 22, 7, 16
-; LE-32BIT-NEXT:    srw 27, 26, 5
-; LE-32BIT-NEXT:    stw 11, 40(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 6, 26
-; LE-32BIT-NEXT:    or 11, 22, 27
-; LE-32BIT-NEXT:    slw 22, 9, 15
-; LE-32BIT-NEXT:    srw 26, 8, 4
-; LE-32BIT-NEXT:    subfic 3, 30, 192
-; LE-32BIT-NEXT:    or 26, 22, 26
-; LE-32BIT-NEXT:    cmpwi 1, 28, 1
-; LE-32BIT-NEXT:    sraw 22, 12, 28
-; LE-32BIT-NEXT:    subfic 19, 3, 32
-; LE-32BIT-NEXT:    srw 4, 29, 4
-; LE-32BIT-NEXT:    slw 28, 12, 15
-; LE-32BIT-NEXT:    stw 9, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 19, 29, 19
-; LE-32BIT-NEXT:    slw 24, 12, 3
-; LE-32BIT-NEXT:    or 9, 28, 4
-; LE-32BIT-NEXT:    lwz 4, 64(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 24, 24, 19
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_7
-; LE-32BIT-NEXT:    b .LBB11_8
-; LE-32BIT-NEXT:  .LBB11_7:
-; LE-32BIT-NEXT:    addi 22, 14, 0
-; LE-32BIT-NEXT:  .LBB11_8:
-; LE-32BIT-NEXT:    srw 19, 7, 31
-; LE-32BIT-NEXT:    cmplwi 5, 30, 64
-; LE-32BIT-NEXT:    cmplwi 1, 30, 128
-; LE-32BIT-NEXT:    slw 3, 29, 3
-; LE-32BIT-NEXT:    or 19, 18, 19
-; LE-32BIT-NEXT:    cmpwi 6, 31, 1
-; LE-32BIT-NEXT:    sraw 18, 12, 31
-; LE-32BIT-NEXT:    crand 21, 4, 20
-; LE-32BIT-NEXT:    srawi 14, 12, 31
-; LE-32BIT-NEXT:    sraw 31, 12, 30
-; LE-32BIT-NEXT:    or 3, 21, 3
-; LE-32BIT-NEXT:    slw 21, 8, 15
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_10
-; LE-32BIT-NEXT:  # %bb.9:
-; LE-32BIT-NEXT:    ori 28, 18, 0
-; LE-32BIT-NEXT:    b .LBB11_11
-; LE-32BIT-NEXT:  .LBB11_10:
-; LE-32BIT-NEXT:    addi 28, 17, 0
-; LE-32BIT-NEXT:  .LBB11_11:
-; LE-32BIT-NEXT:    bc 12, 21, .LBB11_13
-; LE-32BIT-NEXT:  # %bb.12:
-; LE-32BIT-NEXT:    ori 18, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_14
-; LE-32BIT-NEXT:  .LBB11_13:
-; LE-32BIT-NEXT:    addi 18, 31, 0
-; LE-32BIT-NEXT:  .LBB11_14:
-; LE-32BIT-NEXT:    or 21, 20, 21
-; LE-32BIT-NEXT:    subfic 20, 16, 64
-; LE-32BIT-NEXT:    stw 18, 0(4)
-; LE-32BIT-NEXT:    subfic 18, 20, 32
-; LE-32BIT-NEXT:    slw 18, 7, 18
-; LE-32BIT-NEXT:    srw 17, 6, 20
-; LE-32BIT-NEXT:    or 18, 17, 18
-; LE-32BIT-NEXT:    slw 17, 6, 10
-; LE-32BIT-NEXT:    or 27, 0, 17
-; LE-32BIT-NEXT:    slw 0, 29, 25
-; LE-32BIT-NEXT:    mr 31, 8
-; LE-32BIT-NEXT:    or 8, 23, 0
-; LE-32BIT-NEXT:    slw 0, 6, 25
-; LE-32BIT-NEXT:    or 11, 11, 0
-; LE-32BIT-NEXT:    stw 11, 16(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 0, 31, 10
-; LE-32BIT-NEXT:    lwz 11, 32(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 0, 26, 0
-; LE-32BIT-NEXT:    slw 25, 29, 10
-; LE-32BIT-NEXT:    or 23, 9, 25
-; LE-32BIT-NEXT:    slw 26, 29, 11
-; LE-32BIT-NEXT:    or 26, 24, 26
-; LE-32BIT-NEXT:    slw 24, 29, 15
-; LE-32BIT-NEXT:    or 24, 19, 24
-; LE-32BIT-NEXT:    lwz 19, 40(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 25, 7, 20
-; LE-32BIT-NEXT:    lwz 9, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 8, 8, 25
-; LE-32BIT-NEXT:    cmplwi 6, 19, 64
-; LE-32BIT-NEXT:    srw 5, 7, 5
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_16
-; LE-32BIT-NEXT:  # %bb.15:
-; LE-32BIT-NEXT:    ori 3, 9, 0
-; LE-32BIT-NEXT:    b .LBB11_16
-; LE-32BIT-NEXT:  .LBB11_16:
-; LE-32BIT-NEXT:    lwz 9, 28(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 5, 18, 5
-; LE-32BIT-NEXT:    lwz 17, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mr 18, 4
-; LE-32BIT-NEXT:    bc 12, 20, .LBB11_18
-; LE-32BIT-NEXT:  # %bb.17:
-; LE-32BIT-NEXT:    ori 10, 9, 0
-; LE-32BIT-NEXT:    b .LBB11_19
-; LE-32BIT-NEXT:  .LBB11_18:
-; LE-32BIT-NEXT:    addi 10, 21, 0
-; LE-32BIT-NEXT:  .LBB11_19:
-; LE-32BIT-NEXT:    lwz 9, 36(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 20, .LBB11_21
-; LE-32BIT-NEXT:  # %bb.20:
-; LE-32BIT-NEXT:    ori 24, 22, 0
-; LE-32BIT-NEXT:    b .LBB11_21
-; LE-32BIT-NEXT:  .LBB11_21:
-; LE-32BIT-NEXT:    cmplwi 7, 19, 0
-; LE-32BIT-NEXT:    cmplwi 2, 16, 64
-; LE-32BIT-NEXT:    bc 12, 30, .LBB11_22
-; LE-32BIT-NEXT:    b .LBB11_23
-; LE-32BIT-NEXT:  .LBB11_22:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB11_23:
-; LE-32BIT-NEXT:    cmplwi 3, 16, 0
-; LE-32BIT-NEXT:    srw 25, 9, 30
-; LE-32BIT-NEXT:    or 25, 25, 0
-; LE-32BIT-NEXT:    srw 0, 7, 19
-; LE-32BIT-NEXT:    or 26, 0, 26
-; LE-32BIT-NEXT:    srw 0, 7, 30
-; LE-32BIT-NEXT:    or 11, 0, 23
-; LE-32BIT-NEXT:    bc 12, 21, .LBB11_25
-; LE-32BIT-NEXT:  # %bb.24:
-; LE-32BIT-NEXT:    ori 0, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_26
-; LE-32BIT-NEXT:  .LBB11_25:
-; LE-32BIT-NEXT:    addi 0, 28, 0
-; LE-32BIT-NEXT:  .LBB11_26:
-; LE-32BIT-NEXT:    slw 28, 6, 16
-; LE-32BIT-NEXT:    stw 0, 4(4)
-; LE-32BIT-NEXT:    slw 0, 29, 16
-; LE-32BIT-NEXT:    lwz 4, 52(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 5, 0, 5
-; LE-32BIT-NEXT:    lwz 0, 56(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 23, 6, 15
-; LE-32BIT-NEXT:    srw 22, 17, 4
-; LE-32BIT-NEXT:    li 15, 0
-; LE-32BIT-NEXT:    sraw 21, 12, 0
-; LE-32BIT-NEXT:    bc 12, 8, .LBB11_28
-; LE-32BIT-NEXT:  # %bb.27:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_29
-; LE-32BIT-NEXT:  .LBB11_28:
-; LE-32BIT-NEXT:    addi 0, 28, 0
-; LE-32BIT-NEXT:  .LBB11_29:
-; LE-32BIT-NEXT:    bc 12, 20, .LBB11_31
-; LE-32BIT-NEXT:  # %bb.30:
-; LE-32BIT-NEXT:    ori 28, 22, 0
-; LE-32BIT-NEXT:    b .LBB11_32
-; LE-32BIT-NEXT:  .LBB11_31:
-; LE-32BIT-NEXT:    addi 28, 25, 0
-; LE-32BIT-NEXT:  .LBB11_32:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB11_34
-; LE-32BIT-NEXT:  # %bb.33:
-; LE-32BIT-NEXT:    ori 22, 24, 0
-; LE-32BIT-NEXT:    b .LBB11_35
-; LE-32BIT-NEXT:  .LBB11_34:
-; LE-32BIT-NEXT:    addi 22, 6, 0
-; LE-32BIT-NEXT:  .LBB11_35:
-; LE-32BIT-NEXT:    lwz 6, 48(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    sraw 20, 12, 4
-; LE-32BIT-NEXT:    lwz 16, 60(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 20, .LBB11_37
-; LE-32BIT-NEXT:  # %bb.36:
-; LE-32BIT-NEXT:    ori 4, 20, 0
-; LE-32BIT-NEXT:    b .LBB11_38
-; LE-32BIT-NEXT:  .LBB11_37:
-; LE-32BIT-NEXT:    addi 4, 11, 0
-; LE-32BIT-NEXT:  .LBB11_38:
-; LE-32BIT-NEXT:    srw 30, 17, 30
-; LE-32BIT-NEXT:    bc 12, 20, .LBB11_40
-; LE-32BIT-NEXT:  # %bb.39:
-; LE-32BIT-NEXT:    ori 25, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_41
-; LE-32BIT-NEXT:  .LBB11_40:
-; LE-32BIT-NEXT:    addi 25, 6, 0
-; LE-32BIT-NEXT:  .LBB11_41:
-; LE-32BIT-NEXT:    lwz 6, 44(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 8, .LBB11_43
-; LE-32BIT-NEXT:  # %bb.42:
-; LE-32BIT-NEXT:    ori 8, 27, 0
-; LE-32BIT-NEXT:    ori 5, 23, 0
-; LE-32BIT-NEXT:    b .LBB11_43
-; LE-32BIT-NEXT:  .LBB11_43:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB11_44
-; LE-32BIT-NEXT:    b .LBB11_45
-; LE-32BIT-NEXT:  .LBB11_44:
-; LE-32BIT-NEXT:    addi 4, 7, 0
-; LE-32BIT-NEXT:  .LBB11_45:
-; LE-32BIT-NEXT:    sraw 19, 12, 19
-; LE-32BIT-NEXT:    bc 12, 2, .LBB11_46
-; LE-32BIT-NEXT:    b .LBB11_47
-; LE-32BIT-NEXT:  .LBB11_46:
-; LE-32BIT-NEXT:    addi 10, 16, 0
-; LE-32BIT-NEXT:  .LBB11_47:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_49
-; LE-32BIT-NEXT:  # %bb.48:
-; LE-32BIT-NEXT:    ori 26, 21, 0
-; LE-32BIT-NEXT:    b .LBB11_49
-; LE-32BIT-NEXT:  .LBB11_49:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB11_50
-; LE-32BIT-NEXT:    b .LBB11_51
-; LE-32BIT-NEXT:  .LBB11_50:
-; LE-32BIT-NEXT:    addi 5, 29, 0
-; LE-32BIT-NEXT:  .LBB11_51:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_53
-; LE-32BIT-NEXT:  # %bb.52:
-; LE-32BIT-NEXT:    ori 4, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_53
-; LE-32BIT-NEXT:  .LBB11_53:
-; LE-32BIT-NEXT:    or 10, 10, 0
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_55
-; LE-32BIT-NEXT:  # %bb.54:
-; LE-32BIT-NEXT:    ori 24, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_56
-; LE-32BIT-NEXT:  .LBB11_55:
-; LE-32BIT-NEXT:    addi 24, 6, 0
-; LE-32BIT-NEXT:  .LBB11_56:
-; LE-32BIT-NEXT:    lwz 6, 16(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_57
-; LE-32BIT-NEXT:    b .LBB11_58
-; LE-32BIT-NEXT:  .LBB11_57:
-; LE-32BIT-NEXT:    addi 3, 10, 0
-; LE-32BIT-NEXT:  .LBB11_58:
-; LE-32BIT-NEXT:    bc 12, 20, .LBB11_60
-; LE-32BIT-NEXT:  # %bb.59:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_61
-; LE-32BIT-NEXT:  .LBB11_60:
-; LE-32BIT-NEXT:    addi 0, 30, 0
-; LE-32BIT-NEXT:  .LBB11_61:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_63
-; LE-32BIT-NEXT:  # %bb.62:
-; LE-32BIT-NEXT:    ori 30, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_64
-; LE-32BIT-NEXT:  .LBB11_63:
-; LE-32BIT-NEXT:    addi 30, 19, 0
-; LE-32BIT-NEXT:  .LBB11_64:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB11_65
-; LE-32BIT-NEXT:    b .LBB11_66
-; LE-32BIT-NEXT:  .LBB11_65:
-; LE-32BIT-NEXT:    addi 3, 16, 0
-; LE-32BIT-NEXT:  .LBB11_66:
-; LE-32BIT-NEXT:    stw 4, 8(18)
-; LE-32BIT-NEXT:    bc 12, 8, .LBB11_68
-; LE-32BIT-NEXT:  # %bb.67:
-; LE-32BIT-NEXT:    ori 27, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_69
-; LE-32BIT-NEXT:  .LBB11_68:
-; LE-32BIT-NEXT:    addi 27, 6, 0
-; LE-32BIT-NEXT:  .LBB11_69:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB11_71
-; LE-32BIT-NEXT:  # %bb.70:
-; LE-32BIT-NEXT:    ori 6, 8, 0
-; LE-32BIT-NEXT:    b .LBB11_72
-; LE-32BIT-NEXT:  .LBB11_71:
-; LE-32BIT-NEXT:    addi 6, 12, 0
-; LE-32BIT-NEXT:  .LBB11_72:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB11_74
-; LE-32BIT-NEXT:  # %bb.73:
-; LE-32BIT-NEXT:    ori 8, 28, 0
-; LE-32BIT-NEXT:    b .LBB11_75
-; LE-32BIT-NEXT:  .LBB11_74:
-; LE-32BIT-NEXT:    addi 8, 9, 0
-; LE-32BIT-NEXT:  .LBB11_75:
-; LE-32BIT-NEXT:    bc 12, 30, .LBB11_77
-; LE-32BIT-NEXT:  # %bb.76:
-; LE-32BIT-NEXT:    ori 28, 26, 0
-; LE-32BIT-NEXT:    b .LBB11_78
-; LE-32BIT-NEXT:  .LBB11_77:
-; LE-32BIT-NEXT:    addi 28, 7, 0
-; LE-32BIT-NEXT:  .LBB11_78:
-; LE-32BIT-NEXT:    stw 3, 28(18)
-; LE-32BIT-NEXT:    or 7, 8, 27
-; LE-32BIT-NEXT:    or 4, 0, 6
-; LE-32BIT-NEXT:    or 3, 25, 5
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_80
-; LE-32BIT-NEXT:  # %bb.79:
-; LE-32BIT-NEXT:    ori 6, 28, 0
-; LE-32BIT-NEXT:    ori 4, 30, 0
-; LE-32BIT-NEXT:    ori 3, 24, 0
-; LE-32BIT-NEXT:    ori 12, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_81
-; LE-32BIT-NEXT:  .LBB11_80:
-; LE-32BIT-NEXT:    addi 6, 7, 0
-; LE-32BIT-NEXT:    addi 12, 22, 0
-; LE-32BIT-NEXT:  .LBB11_81:
-; LE-32BIT-NEXT:    bc 12, 2, .LBB11_83
-; LE-32BIT-NEXT:  # %bb.82:
-; LE-32BIT-NEXT:    ori 5, 6, 0
-; LE-32BIT-NEXT:    b .LBB11_84
-; LE-32BIT-NEXT:  .LBB11_83:
-; LE-32BIT-NEXT:    addi 5, 9, 0
-; LE-32BIT-NEXT:    addi 4, 17, 0
-; LE-32BIT-NEXT:    addi 3, 31, 0
-; LE-32BIT-NEXT:  .LBB11_84:
-; LE-32BIT-NEXT:    stw 12, 12(18)
-; LE-32BIT-NEXT:    stw 5, 24(18)
-; LE-32BIT-NEXT:    stw 4, 16(18)
-; LE-32BIT-NEXT:    stw 3, 20(18)
-; LE-32BIT-NEXT:    lwz 12, 68(1)
-; LE-32BIT-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
-; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
-; LE-32BIT-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 144
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    stw 3, 40(1)
+; LE-32BIT-NEXT:    stw 3, 36(1)
+; LE-32BIT-NEXT:    stw 3, 32(1)
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 16(1)
+; LE-32BIT-NEXT:    sub 3, 6, 4
+; LE-32BIT-NEXT:    lwz 4, 4(3)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 12(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 20(3)
+; LE-32BIT-NEXT:    lwz 10, 16(3)
+; LE-32BIT-NEXT:    lwz 11, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    stw 10, 16(5)
+; LE-32BIT-NEXT:    stw 9, 20(5)
+; LE-32BIT-NEXT:    stw 8, 8(5)
+; LE-32BIT-NEXT:    stw 7, 12(5)
+; LE-32BIT-NEXT:    stw 6, 0(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 80
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1

diff  --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 92d582e27123f..dd150f4aee0fc 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -206,93 +206,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; LE-32BIT-LABEL: lshr_16bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -32(1)
+; LE-32BIT-NEXT:    stwu 1, -48(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    li 8, 0
-; LE-32BIT-NEXT:    lwz 6, 8(3)
-; LE-32BIT-NEXT:    lwz 7, 12(3)
-; LE-32BIT-NEXT:    subfic 10, 4, 96
-; LE-32BIT-NEXT:    lwz 9, 4(3)
-; LE-32BIT-NEXT:    addi 11, 4, -64
-; LE-32BIT-NEXT:    lwz 3, 0(3)
-; LE-32BIT-NEXT:    cmplwi 4, 64
-; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 27, 9, 11
-; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 28, 3, 4
-; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    slw 10, 3, 10
-; LE-32BIT-NEXT:    addi 12, 4, -96
-; LE-32BIT-NEXT:    srw 0, 7, 4
-; LE-32BIT-NEXT:    or 10, 27, 10
-; LE-32BIT-NEXT:    slw 27, 6, 30
-; LE-32BIT-NEXT:    bc 12, 0, .LBB6_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 28, 8, 0
-; LE-32BIT-NEXT:    b .LBB6_2
-; LE-32BIT-NEXT:  .LBB6_2:
-; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 29, 9, 4
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    slw 27, 3, 30
-; LE-32BIT-NEXT:    stw 28, 0(5)
-; LE-32BIT-NEXT:    subfic 28, 4, 64
-; LE-32BIT-NEXT:    srw 12, 3, 12
-; LE-32BIT-NEXT:    or 29, 29, 27
-; LE-32BIT-NEXT:    addi 27, 4, -32
-; LE-32BIT-NEXT:    or 10, 10, 12
-; LE-32BIT-NEXT:    subfic 12, 28, 32
-; LE-32BIT-NEXT:    slw 30, 9, 30
-; LE-32BIT-NEXT:    srw 12, 9, 12
-; LE-32BIT-NEXT:    slw 9, 9, 28
-; LE-32BIT-NEXT:    slw 28, 3, 28
-; LE-32BIT-NEXT:    srw 11, 3, 11
-; LE-32BIT-NEXT:    srw 3, 3, 27
-; LE-32BIT-NEXT:    srw 27, 6, 27
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    or 12, 28, 12
-; LE-32BIT-NEXT:    cmplwi 1, 4, 0
-; LE-32BIT-NEXT:    srw 4, 6, 4
-; LE-32BIT-NEXT:    or 3, 29, 3
-; LE-32BIT-NEXT:    or 9, 0, 9
-; LE-32BIT-NEXT:    or 12, 12, 30
-; LE-32BIT-NEXT:    bc 12, 0, .LBB6_4
-; LE-32BIT-NEXT:  # %bb.3:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    ori 8, 10, 0
-; LE-32BIT-NEXT:    b .LBB6_5
-; LE-32BIT-NEXT:  .LBB6_4:
-; LE-32BIT-NEXT:    addi 8, 9, 0
-; LE-32BIT-NEXT:  .LBB6_5:
-; LE-32BIT-NEXT:    or 4, 4, 12
-; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    bc 12, 6, .LBB6_7
-; LE-32BIT-NEXT:  # %bb.6:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    b .LBB6_8
-; LE-32BIT-NEXT:  .LBB6_7:
-; LE-32BIT-NEXT:    addi 3, 7, 0
-; LE-32BIT-NEXT:  .LBB6_8:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB6_10
-; LE-32BIT-NEXT:  # %bb.9:
-; LE-32BIT-NEXT:    ori 4, 11, 0
-; LE-32BIT-NEXT:    b .LBB6_10
-; LE-32BIT-NEXT:  .LBB6_10:
-; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    bc 12, 6, .LBB6_12
-; LE-32BIT-NEXT:  # %bb.11:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB6_13
-; LE-32BIT-NEXT:  .LBB6_12:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB6_13:
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    stw 6, 28(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    addi 6, 1, 32
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 28, 31
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    sub 6, 6, 7
+; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    li 3, 7
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    nand 3, 4, 3
+; LE-32BIT-NEXT:    lwz 7, 4(6)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 29
+; LE-32BIT-NEXT:    lwz 8, 8(6)
+; LE-32BIT-NEXT:    subfic 10, 4, 32
+; LE-32BIT-NEXT:    lwz 9, 0(6)
+; LE-32BIT-NEXT:    clrlwi 3, 3, 27
+; LE-32BIT-NEXT:    lwz 6, 12(6)
+; LE-32BIT-NEXT:    srw 11, 8, 4
+; LE-32BIT-NEXT:    slw 8, 8, 10
+; LE-32BIT-NEXT:    slw 10, 9, 10
+; LE-32BIT-NEXT:    srw 6, 6, 4
+; LE-32BIT-NEXT:    srw 9, 9, 4
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    slwi 7, 7, 1
+; LE-32BIT-NEXT:    slw 3, 7, 3
+; LE-32BIT-NEXT:    or 6, 8, 6
+; LE-32BIT-NEXT:    or 4, 10, 4
+; LE-32BIT-NEXT:    or 3, 11, 3
+; LE-32BIT-NEXT:    stw 9, 0(5)
+; LE-32BIT-NEXT:    stw 6, 12(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
 ; LE-32BIT-NEXT:    stw 3, 8(5)
-; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -337,93 +293,48 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; LE-32BIT-LABEL: shl_16bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -32(1)
-; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    li 8, 0
-; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
-; LE-32BIT-NEXT:    subfic 10, 4, 96
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    addi 11, 4, -64
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
-; LE-32BIT-NEXT:    cmplwi 4, 64
-; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 27, 9, 11
-; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 28, 3, 4
-; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    srw 10, 3, 10
-; LE-32BIT-NEXT:    addi 12, 4, -96
-; LE-32BIT-NEXT:    slw 0, 7, 4
-; LE-32BIT-NEXT:    or 10, 27, 10
-; LE-32BIT-NEXT:    srw 27, 6, 30
-; LE-32BIT-NEXT:    bc 12, 0, .LBB7_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 28, 8, 0
-; LE-32BIT-NEXT:    b .LBB7_2
-; LE-32BIT-NEXT:  .LBB7_2:
-; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 29, 9, 4
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    srw 27, 3, 30
-; LE-32BIT-NEXT:    stw 28, 12(5)
-; LE-32BIT-NEXT:    subfic 28, 4, 64
-; LE-32BIT-NEXT:    slw 12, 3, 12
-; LE-32BIT-NEXT:    or 29, 29, 27
-; LE-32BIT-NEXT:    addi 27, 4, -32
-; LE-32BIT-NEXT:    or 10, 10, 12
-; LE-32BIT-NEXT:    subfic 12, 28, 32
-; LE-32BIT-NEXT:    srw 30, 9, 30
-; LE-32BIT-NEXT:    slw 12, 9, 12
-; LE-32BIT-NEXT:    srw 9, 9, 28
-; LE-32BIT-NEXT:    srw 28, 3, 28
-; LE-32BIT-NEXT:    slw 11, 3, 11
-; LE-32BIT-NEXT:    slw 3, 3, 27
-; LE-32BIT-NEXT:    slw 27, 6, 27
-; LE-32BIT-NEXT:    or 0, 0, 27
-; LE-32BIT-NEXT:    or 12, 28, 12
-; LE-32BIT-NEXT:    cmplwi 1, 4, 0
-; LE-32BIT-NEXT:    slw 4, 6, 4
-; LE-32BIT-NEXT:    or 3, 29, 3
-; LE-32BIT-NEXT:    or 9, 0, 9
-; LE-32BIT-NEXT:    or 12, 12, 30
-; LE-32BIT-NEXT:    bc 12, 0, .LBB7_4
-; LE-32BIT-NEXT:  # %bb.3:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    ori 8, 10, 0
-; LE-32BIT-NEXT:    b .LBB7_5
-; LE-32BIT-NEXT:  .LBB7_4:
-; LE-32BIT-NEXT:    addi 8, 9, 0
-; LE-32BIT-NEXT:  .LBB7_5:
-; LE-32BIT-NEXT:    or 4, 4, 12
-; LE-32BIT-NEXT:    stw 3, 8(5)
-; LE-32BIT-NEXT:    bc 12, 6, .LBB7_7
-; LE-32BIT-NEXT:  # %bb.6:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    b .LBB7_8
-; LE-32BIT-NEXT:  .LBB7_7:
-; LE-32BIT-NEXT:    addi 3, 7, 0
-; LE-32BIT-NEXT:  .LBB7_8:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB7_10
-; LE-32BIT-NEXT:  # %bb.9:
-; LE-32BIT-NEXT:    ori 4, 11, 0
-; LE-32BIT-NEXT:    b .LBB7_10
-; LE-32BIT-NEXT:  .LBB7_10:
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 31
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    stw 7, 16(1)
+; LE-32BIT-NEXT:    li 7, 7
+; LE-32BIT-NEXT:    lwzux 3, 6, 3
+; LE-32BIT-NEXT:    nand 7, 4, 7
+; LE-32BIT-NEXT:    clrlwi 4, 4, 29
+; LE-32BIT-NEXT:    subfic 10, 4, 32
+; LE-32BIT-NEXT:    lwz 8, 8(6)
+; LE-32BIT-NEXT:    clrlwi 7, 7, 27
+; LE-32BIT-NEXT:    lwz 9, 4(6)
+; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 6, 12(6)
+; LE-32BIT-NEXT:    slw 11, 9, 4
+; LE-32BIT-NEXT:    srw 9, 9, 10
+; LE-32BIT-NEXT:    srw 10, 6, 10
+; LE-32BIT-NEXT:    slw 6, 6, 4
+; LE-32BIT-NEXT:    slw 4, 8, 4
+; LE-32BIT-NEXT:    srwi 8, 8, 1
+; LE-32BIT-NEXT:    srw 7, 8, 7
+; LE-32BIT-NEXT:    or 3, 3, 9
+; LE-32BIT-NEXT:    or 4, 4, 10
 ; LE-32BIT-NEXT:    stw 3, 0(5)
-; LE-32BIT-NEXT:    bc 12, 6, .LBB7_12
-; LE-32BIT-NEXT:  # %bb.11:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB7_13
-; LE-32BIT-NEXT:  .LBB7_12:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB7_13:
+; LE-32BIT-NEXT:    or 3, 11, 7
+; LE-32BIT-NEXT:    stw 6, 12(5)
+; LE-32BIT-NEXT:    stw 4, 8(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
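
Reading the regenerated shl_16bytes checks above, the new sequence boils down to: spill the value into a double-width stack slot next to a block of zero padding, pick the load address from bitOff/8, and finish with a shift by bitOff%8. As a rough, editor-written illustration only (not the code the backend emits), the same computation in C could look like the sketch below; it assumes a little-endian host and a shift amount below 128, and the function name is made up.

#include <stdint.h>
#include <string.h>

/* Editor-written sketch of what the shl_16bytes lowering above computes;
 * not compiler output.  Assumes a little-endian host and amt < 128. */
static void shl_16bytes_sketch(const uint8_t src[16], unsigned amt,
                               uint8_t dst[16]) {
    uint8_t slot[32];
    memset(slot, 0, 16);          /* zero padding below the value */
    memcpy(slot + 16, src, 16);   /* the value being shifted */

    unsigned byte_off = amt / 8;  /* whole-byte part of the shift amount */
    unsigned bit_off  = amt % 8;  /* remainder, applied after the load */

    uint64_t lo, hi;
    memcpy(&lo, slot + 16 - byte_off, 8);      /* indexed load from the slot */
    memcpy(&hi, slot + 16 - byte_off + 8, 8);

    if (bit_off) {                /* residual left shift by the remainder */
        hi = (hi << bit_off) | (lo >> (64 - bit_off));
        lo <<= bit_off;
    }
    memcpy(dst, &lo, 8);
    memcpy(dst + 8, &hi, 8);
}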
@@ -474,101 +385,49 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; LE-32BIT-LABEL: ashr_16bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -32(1)
-; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    lwz 6, 8(3)
-; LE-32BIT-NEXT:    lwz 7, 12(3)
-; LE-32BIT-NEXT:    subfic 9, 4, 96
+; LE-32BIT-NEXT:    stwu 1, -48(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 7
 ; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    addi 10, 4, -64
-; LE-32BIT-NEXT:    lwz 3, 0(3)
-; LE-32BIT-NEXT:    subfic 0, 4, 32
-; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 27, 8, 10
-; LE-32BIT-NEXT:    slw 9, 3, 9
-; LE-32BIT-NEXT:    srw 12, 7, 4
-; LE-32BIT-NEXT:    or 9, 27, 9
-; LE-32BIT-NEXT:    slw 27, 6, 0
-; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 29, 8, 4
-; LE-32BIT-NEXT:    or 12, 12, 27
-; LE-32BIT-NEXT:    slw 27, 3, 0
-; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    cmplwi 4, 64
-; LE-32BIT-NEXT:    srawi 28, 3, 31
-; LE-32BIT-NEXT:    or 29, 29, 27
-; LE-32BIT-NEXT:    sraw 27, 3, 4
-; LE-32BIT-NEXT:    addi 11, 4, -96
-; LE-32BIT-NEXT:    bc 12, 0, .LBB8_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 27, 28, 0
-; LE-32BIT-NEXT:    b .LBB8_2
-; LE-32BIT-NEXT:  .LBB8_2:
-; LE-32BIT-NEXT:    cmpwi 1, 11, 1
-; LE-32BIT-NEXT:    sraw 11, 3, 11
-; LE-32BIT-NEXT:    stw 27, 0(5)
-; LE-32BIT-NEXT:    subfic 27, 4, 64
-; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    addi 30, 4, -32
-; LE-32BIT-NEXT:    bc 12, 4, .LBB8_4
-; LE-32BIT-NEXT:  # %bb.3:
-; LE-32BIT-NEXT:    ori 9, 11, 0
-; LE-32BIT-NEXT:    b .LBB8_4
-; LE-32BIT-NEXT:  .LBB8_4:
-; LE-32BIT-NEXT:    subfic 11, 27, 32
-; LE-32BIT-NEXT:    slw 0, 8, 0
-; LE-32BIT-NEXT:    srw 11, 8, 11
-; LE-32BIT-NEXT:    slw 8, 8, 27
-; LE-32BIT-NEXT:    slw 27, 3, 27
-; LE-32BIT-NEXT:    sraw 10, 3, 10
-; LE-32BIT-NEXT:    sraw 3, 3, 30
-; LE-32BIT-NEXT:    cmpwi 1, 30, 1
-; LE-32BIT-NEXT:    srw 30, 6, 30
-; LE-32BIT-NEXT:    or 12, 12, 30
-; LE-32BIT-NEXT:    or 11, 27, 11
-; LE-32BIT-NEXT:    bc 12, 4, .LBB8_5
-; LE-32BIT-NEXT:    b .LBB8_6
-; LE-32BIT-NEXT:  .LBB8_5:
-; LE-32BIT-NEXT:    addi 3, 29, 0
-; LE-32BIT-NEXT:  .LBB8_6:
-; LE-32BIT-NEXT:    cmplwi 1, 4, 0
-; LE-32BIT-NEXT:    srw 4, 6, 4
-; LE-32BIT-NEXT:    or 8, 12, 8
-; LE-32BIT-NEXT:    or 11, 11, 0
-; LE-32BIT-NEXT:    bc 12, 0, .LBB8_8
-; LE-32BIT-NEXT:  # %bb.7:
-; LE-32BIT-NEXT:    ori 3, 28, 0
-; LE-32BIT-NEXT:    ori 8, 9, 0
-; LE-32BIT-NEXT:    b .LBB8_8
-; LE-32BIT-NEXT:  .LBB8_8:
-; LE-32BIT-NEXT:    or 4, 4, 11
-; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    bc 12, 6, .LBB8_10
-; LE-32BIT-NEXT:  # %bb.9:
-; LE-32BIT-NEXT:    ori 3, 8, 0
-; LE-32BIT-NEXT:    b .LBB8_11
-; LE-32BIT-NEXT:  .LBB8_10:
-; LE-32BIT-NEXT:    addi 3, 7, 0
-; LE-32BIT-NEXT:  .LBB8_11:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB8_13
-; LE-32BIT-NEXT:  # %bb.12:
-; LE-32BIT-NEXT:    ori 4, 10, 0
-; LE-32BIT-NEXT:    b .LBB8_13
-; LE-32BIT-NEXT:  .LBB8_13:
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    srawi 3, 7, 31
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    rlwinm 8, 4, 29, 28, 31
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    addi 7, 1, 32
+; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    nand 6, 4, 6
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 29
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    subfic 10, 4, 32
+; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    clrlwi 6, 6, 27
+; LE-32BIT-NEXT:    stw 3, 16(1)
+; LE-32BIT-NEXT:    sub 3, 7, 8
+; LE-32BIT-NEXT:    lwz 7, 4(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 0(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    srw 11, 8, 4
+; LE-32BIT-NEXT:    slw 8, 8, 10
+; LE-32BIT-NEXT:    slw 10, 9, 10
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    sraw 9, 9, 4
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    slwi 7, 7, 1
+; LE-32BIT-NEXT:    or 3, 8, 3
+; LE-32BIT-NEXT:    slw 6, 7, 6
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    bc 12, 6, .LBB8_15
-; LE-32BIT-NEXT:  # %bb.14:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB8_16
-; LE-32BIT-NEXT:  .LBB8_15:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB8_16:
+; LE-32BIT-NEXT:    or 3, 10, 4
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    or 3, 11, 6
+; LE-32BIT-NEXT:    stw 9, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 8(5)
-; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
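
The ashr_16bytes checks above differ only in the padding: the srawi of the top word is stored into the other half of the slot, so the bytes shifted in replicate the sign. A hedged sketch of that variant, under the same little-endian-host and amt < 128 assumptions as before (again editor-written, reusing the headers from the previous sketch, name made up):

/* Editor-written sketch of the sign-padded variant; not compiler output. */
static void ashr_16bytes_sketch(const uint8_t src[16], unsigned amt,
                                uint8_t dst[16]) {
    uint8_t sign = (src[15] & 0x80) ? 0xFF : 0x00;  /* sign byte of the i128 */
    uint8_t slot[32];
    memcpy(slot, src, 16);        /* the value in the lower half */
    memset(slot + 16, sign, 16);  /* sign bytes instead of zeroes */

    unsigned byte_off = amt / 8;
    unsigned bit_off  = amt % 8;

    uint64_t lo, hi;
    memcpy(&lo, slot + byte_off, 8);       /* indexed load, this time upwards */
    memcpy(&hi, slot + byte_off + 8, 8);

    if (bit_off) {                /* residual right shift, refilling the sign */
        lo = (lo >> bit_off) | (hi << (64 - bit_off));
        hi >>= bit_off;
        if (sign)
            hi |= ~0ULL << (64 - bit_off);
    }
    memcpy(dst, &lo, 8);
    memcpy(dst + 8, &hi, 8);
}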
@@ -580,598 +439,183 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: lshr_32bytes:
 ; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    li 6, 16
+; LE-64BIT-NEXT:    lxvd2x 2, 0, 3
+; LE-64BIT-NEXT:    xxlxor 0, 0, 0
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
-; LE-64BIT-NEXT:    ld 7, 0(3)
-; LE-64BIT-NEXT:    ld 8, 8(3)
-; LE-64BIT-NEXT:    ld 9, 16(3)
-; LE-64BIT-NEXT:    li 6, 0
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    li 8, 32
+; LE-64BIT-NEXT:    lxvd2x 1, 3, 6
+; LE-64BIT-NEXT:    li 3, 48
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 3
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 8
+; LE-64BIT-NEXT:    rlwinm 3, 4, 29, 27, 31
+; LE-64BIT-NEXT:    stxvd2x 1, 7, 6
+; LE-64BIT-NEXT:    stxvd2x 2, 0, 7
+; LE-64BIT-NEXT:    ldux 6, 3, 7
+; LE-64BIT-NEXT:    li 7, 7
+; LE-64BIT-NEXT:    nand 7, 4, 7
+; LE-64BIT-NEXT:    clrlwi 4, 4, 29
+; LE-64BIT-NEXT:    clrlwi 7, 7, 26
+; LE-64BIT-NEXT:    subfic 11, 4, 64
+; LE-64BIT-NEXT:    ld 8, 16(3)
+; LE-64BIT-NEXT:    ld 9, 8(3)
 ; LE-64BIT-NEXT:    ld 3, 24(3)
-; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 21, -88(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 24, -64(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    subfic 28, 4, 64
-; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    subfic 11, 4, 192
-; LE-64BIT-NEXT:    addi 0, 4, -128
-; LE-64BIT-NEXT:    subfic 25, 4, 128
-; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    srd 29, 9, 4
-; LE-64BIT-NEXT:    addi 27, 4, -64
-; LE-64BIT-NEXT:    std 22, -80(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    sld 24, 8, 28
-; LE-64BIT-NEXT:    sld 21, 9, 28
-; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    sld 28, 3, 28
-; LE-64BIT-NEXT:    srd 10, 7, 4
-; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    addi 30, 4, -192
+; LE-64BIT-NEXT:    srd 6, 6, 4
+; LE-64BIT-NEXT:    sldi 10, 8, 1
+; LE-64BIT-NEXT:    srd 8, 8, 4
+; LE-64BIT-NEXT:    sld 7, 10, 7
+; LE-64BIT-NEXT:    srd 10, 9, 4
+; LE-64BIT-NEXT:    sld 9, 9, 11
 ; LE-64BIT-NEXT:    sld 11, 3, 11
-; LE-64BIT-NEXT:    subfic 22, 25, 64
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    srd 26, 9, 0
-; LE-64BIT-NEXT:    srd 28, 3, 27
-; LE-64BIT-NEXT:    std 23, -72(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    or 10, 10, 24
-; LE-64BIT-NEXT:    ld 24, -64(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 30, 3, 30
-; LE-64BIT-NEXT:    srd 23, 8, 27
-; LE-64BIT-NEXT:    or 11, 26, 11
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    srd 27, 9, 22
-; LE-64BIT-NEXT:    sld 28, 3, 25
-; LE-64BIT-NEXT:    or 10, 10, 23
-; LE-64BIT-NEXT:    or 11, 11, 30
-; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 9, 9, 25
-; LE-64BIT-NEXT:    or 30, 28, 27
-; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 23, -72(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    cmplwi 4, 128
-; LE-64BIT-NEXT:    srd 12, 8, 4
-; LE-64BIT-NEXT:    or 9, 10, 9
-; LE-64BIT-NEXT:    or 30, 30, 21
-; LE-64BIT-NEXT:    ld 22, -80(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 21, -88(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    cmplwi 1, 4, 0
-; LE-64BIT-NEXT:    srd 10, 3, 0
-; LE-64BIT-NEXT:    isellt 9, 9, 11
-; LE-64BIT-NEXT:    or 11, 12, 30
-; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    isel 7, 7, 9, 6
+; LE-64BIT-NEXT:    or 7, 10, 7
+; LE-64BIT-NEXT:    or 8, 11, 8
+; LE-64BIT-NEXT:    or 6, 9, 6
+; LE-64BIT-NEXT:    std 7, 8(5)
 ; LE-64BIT-NEXT:    srd 3, 3, 4
-; LE-64BIT-NEXT:    isellt 9, 11, 10
-; LE-64BIT-NEXT:    std 7, 0(5)
-; LE-64BIT-NEXT:    isellt 0, 29, 6
-; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    isel 4, 8, 9, 6
-; LE-64BIT-NEXT:    std 0, 16(5)
-; LE-64BIT-NEXT:    isellt 3, 3, 6
-; LE-64BIT-NEXT:    std 4, 8(5)
+; LE-64BIT-NEXT:    std 6, 0(5)
+; LE-64BIT-NEXT:    std 8, 16(5)
 ; LE-64BIT-NEXT:    std 3, 24(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: lshr_32bytes:
 ; BE:       # %bb.0:
-; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    ld 7, 24(3)
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 7, 8(3)
 ; BE-NEXT:    ld 8, 16(3)
-; BE-NEXT:    ld 9, 8(3)
-; BE-NEXT:    ld 3, 0(3)
-; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; BE-NEXT:    li 6, 0
-; BE-NEXT:    subfic 10, 4, 192
-; BE-NEXT:    addi 11, 4, -128
-; BE-NEXT:    addi 12, 4, -192
-; BE-NEXT:    subfic 30, 4, 64
-; BE-NEXT:    sld 10, 3, 10
-; BE-NEXT:    srd 27, 9, 11
-; BE-NEXT:    srd 0, 7, 4
-; BE-NEXT:    addi 29, 4, -64
-; BE-NEXT:    subfic 28, 4, 128
-; BE-NEXT:    srd 12, 3, 12
-; BE-NEXT:    or 10, 27, 10
-; BE-NEXT:    sld 27, 8, 30
-; BE-NEXT:    or 10, 10, 12
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    srd 27, 8, 29
-; BE-NEXT:    subfic 12, 28, 64
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    sld 27, 3, 28
-; BE-NEXT:    srd 12, 9, 12
-; BE-NEXT:    sld 28, 9, 28
-; BE-NEXT:    cmplwi 4, 128
-; BE-NEXT:    or 12, 27, 12
-; BE-NEXT:    or 28, 0, 28
-; BE-NEXT:    sld 0, 9, 30
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    addi 9, 1, -64
+; BE-NEXT:    li 10, 0
+; BE-NEXT:    addi 11, 1, -32
+; BE-NEXT:    std 3, 56(9)
+; BE-NEXT:    rlwinm 3, 4, 29, 27, 31
+; BE-NEXT:    neg 3, 3
+; BE-NEXT:    std 10, 24(9)
+; BE-NEXT:    std 10, 16(9)
+; BE-NEXT:    std 10, 8(9)
+; BE-NEXT:    std 10, -64(1)
+; BE-NEXT:    std 8, 48(9)
+; BE-NEXT:    std 7, 40(9)
+; BE-NEXT:    std 6, 32(9)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    ldux 3, 11, 3
+; BE-NEXT:    li 6, 7
+; BE-NEXT:    nand 6, 4, 6
+; BE-NEXT:    clrlwi 4, 4, 29
+; BE-NEXT:    clrlwi 6, 6, 26
+; BE-NEXT:    ld 7, 8(11)
+; BE-NEXT:    ld 8, 16(11)
+; BE-NEXT:    ld 9, 24(11)
+; BE-NEXT:    subfic 10, 4, 64
+; BE-NEXT:    sldi 11, 7, 1
+; BE-NEXT:    srd 7, 7, 4
 ; BE-NEXT:    srd 9, 9, 4
-; BE-NEXT:    srd 11, 3, 11
-; BE-NEXT:    cmplwi 1, 4, 0
-; BE-NEXT:    or 12, 12, 0
-; BE-NEXT:    srd 0, 8, 4
-; BE-NEXT:    bc 12, 0, .LBB9_1
-; BE-NEXT:    b .LBB9_2
-; BE-NEXT:  .LBB9_1:
-; BE-NEXT:    addi 10, 28, 0
-; BE-NEXT:  .LBB9_2:
-; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; BE-NEXT:    or 12, 0, 12
-; BE-NEXT:    sld 0, 3, 30
-; BE-NEXT:    srd 30, 3, 29
-; BE-NEXT:    bc 12, 0, .LBB9_3
-; BE-NEXT:    b .LBB9_4
-; BE-NEXT:  .LBB9_3:
-; BE-NEXT:    addi 11, 12, 0
-; BE-NEXT:  .LBB9_4:
+; BE-NEXT:    sld 6, 11, 6
+; BE-NEXT:    sld 11, 3, 10
+; BE-NEXT:    sld 10, 8, 10
+; BE-NEXT:    srd 8, 8, 4
 ; BE-NEXT:    srd 3, 3, 4
-; BE-NEXT:    bc 12, 6, .LBB9_6
-; BE-NEXT:  # %bb.5:
-; BE-NEXT:    ori 4, 10, 0
-; BE-NEXT:    b .LBB9_7
-; BE-NEXT:  .LBB9_6:
-; BE-NEXT:    addi 4, 7, 0
-; BE-NEXT:  .LBB9_7:
-; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; BE-NEXT:    or 9, 9, 0
-; BE-NEXT:    or 9, 9, 30
-; BE-NEXT:    bc 12, 6, .LBB9_9
-; BE-NEXT:  # %bb.8:
-; BE-NEXT:    ori 7, 11, 0
-; BE-NEXT:    b .LBB9_10
-; BE-NEXT:  .LBB9_9:
-; BE-NEXT:    addi 7, 8, 0
-; BE-NEXT:  .LBB9_10:
-; BE-NEXT:    bc 12, 0, .LBB9_12
-; BE-NEXT:  # %bb.11:
-; BE-NEXT:    ori 8, 6, 0
-; BE-NEXT:    ori 3, 6, 0
-; BE-NEXT:    b .LBB9_13
-; BE-NEXT:  .LBB9_12:
-; BE-NEXT:    addi 8, 9, 0
-; BE-NEXT:  .LBB9_13:
-; BE-NEXT:    std 4, 24(5)
-; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; BE-NEXT:    or 7, 11, 7
+; BE-NEXT:    or 6, 8, 6
+; BE-NEXT:    or 8, 10, 9
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 8, 8(5)
-; BE-NEXT:    std 7, 16(5)
+; BE-NEXT:    std 8, 24(5)
+; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    std 6, 16(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: lshr_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -144(1)
-; LE-32BIT-NEXT:    mfcr 12
-; LE-32BIT-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 12, 68(1)
-; LE-32BIT-NEXT:    lwz 30, 28(4)
-; LE-32BIT-NEXT:    lwz 9, 28(3)
-; LE-32BIT-NEXT:    lwz 10, 4(3)
-; LE-32BIT-NEXT:    subfic 21, 30, 224
-; LE-32BIT-NEXT:    lwz 11, 0(3)
-; LE-32BIT-NEXT:    subfic 4, 30, 160
-; LE-32BIT-NEXT:    stw 5, 64(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    addi 0, 30, -128
-; LE-32BIT-NEXT:    lwz 5, 24(3)
-; LE-32BIT-NEXT:    subfic 28, 30, 96
-; LE-32BIT-NEXT:    lwz 19, 20(3)
-; LE-32BIT-NEXT:    addi 29, 30, -64
-; LE-32BIT-NEXT:    lwz 8, 16(3)
-; LE-32BIT-NEXT:    srw 20, 9, 30
-; LE-32BIT-NEXT:    lwz 12, 12(3)
-; LE-32BIT-NEXT:    slw 21, 11, 21
-; LE-32BIT-NEXT:    lwz 6, 8(3)
-; LE-32BIT-NEXT:    addi 3, 30, -192
-; LE-32BIT-NEXT:    stw 9, 60(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 9, 30, 32
-; LE-32BIT-NEXT:    srw 16, 10, 3
-; LE-32BIT-NEXT:    stw 3, 56(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 15, 6, 4
-; LE-32BIT-NEXT:    srw 14, 12, 0
-; LE-32BIT-NEXT:    slw 31, 8, 28
-; LE-32BIT-NEXT:    srw 3, 19, 29
-; LE-32BIT-NEXT:    or 21, 16, 21
-; LE-32BIT-NEXT:    slw 16, 5, 9
-; LE-32BIT-NEXT:    srw 25, 19, 30
-; LE-32BIT-NEXT:    or 15, 14, 15
-; LE-32BIT-NEXT:    slw 14, 8, 9
-; LE-32BIT-NEXT:    or 3, 3, 31
-; LE-32BIT-NEXT:    slw 31, 11, 4
-; LE-32BIT-NEXT:    or 20, 20, 16
-; LE-32BIT-NEXT:    srw 16, 10, 0
-; LE-32BIT-NEXT:    addi 26, 30, -224
-; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 4, 25, 14
-; LE-32BIT-NEXT:    slw 14, 11, 28
-; LE-32BIT-NEXT:    or 16, 16, 31
-; LE-32BIT-NEXT:    srw 31, 10, 29
-; LE-32BIT-NEXT:    addi 23, 30, -160
-; LE-32BIT-NEXT:    srw 18, 12, 30
-; LE-32BIT-NEXT:    stw 0, 40(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 29, 6
-; LE-32BIT-NEXT:    slw 31, 6, 9
-; LE-32BIT-NEXT:    srw 0, 11, 26
-; LE-32BIT-NEXT:    addi 24, 30, -96
-; LE-32BIT-NEXT:    srw 17, 10, 30
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    slw 31, 11, 9
-; LE-32BIT-NEXT:    or 6, 21, 0
-; LE-32BIT-NEXT:    srw 0, 29, 23
-; LE-32BIT-NEXT:    or 17, 17, 31
-; LE-32BIT-NEXT:    addi 31, 30, -32
-; LE-32BIT-NEXT:    or 0, 15, 0
-; LE-32BIT-NEXT:    srw 15, 8, 24
-; LE-32BIT-NEXT:    or 3, 3, 15
-; LE-32BIT-NEXT:    srw 15, 5, 31
-; LE-32BIT-NEXT:    or 20, 20, 15
-; LE-32BIT-NEXT:    srw 15, 8, 31
-; LE-32BIT-NEXT:    stw 3, 28(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 3, 4, 15
-; LE-32BIT-NEXT:    srw 23, 11, 23
-; LE-32BIT-NEXT:    stw 3, 48(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 15, 30, 64
-; LE-32BIT-NEXT:    or 3, 16, 23
-; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 3, 15, 32
-; LE-32BIT-NEXT:    slw 16, 29, 15
-; LE-32BIT-NEXT:    srw 22, 12, 3
-; LE-32BIT-NEXT:    or 21, 16, 22
-; LE-32BIT-NEXT:    subfic 16, 30, 128
-; LE-32BIT-NEXT:    mr 7, 10
-; LE-32BIT-NEXT:    mr 10, 5
-; LE-32BIT-NEXT:    subfic 5, 16, 32
-; LE-32BIT-NEXT:    stw 6, 32(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 6, 24
-; LE-32BIT-NEXT:    slw 4, 11, 16
-; LE-32BIT-NEXT:    srw 24, 7, 5
-; LE-32BIT-NEXT:    or 22, 4, 24
-; LE-32BIT-NEXT:    slw 24, 29, 16
-; LE-32BIT-NEXT:    srw 27, 12, 5
-; LE-32BIT-NEXT:    or 27, 24, 27
-; LE-32BIT-NEXT:    slw 24, 8, 15
-; LE-32BIT-NEXT:    srw 26, 19, 3
-; LE-32BIT-NEXT:    or 26, 24, 26
-; LE-32BIT-NEXT:    subfic 24, 30, 192
-; LE-32BIT-NEXT:    mr 25, 28
-; LE-32BIT-NEXT:    subfic 28, 24, 32
-; LE-32BIT-NEXT:    mr 23, 19
-; LE-32BIT-NEXT:    srw 28, 7, 28
-; LE-32BIT-NEXT:    slw 19, 11, 24
-; LE-32BIT-NEXT:    mr 4, 29
-; LE-32BIT-NEXT:    or 28, 19, 28
-; LE-32BIT-NEXT:    srw 19, 11, 6
-; LE-32BIT-NEXT:    or 19, 14, 19
-; LE-32BIT-NEXT:    srw 14, 4, 31
-; LE-32BIT-NEXT:    or 6, 18, 14
-; LE-32BIT-NEXT:    lwz 18, 64(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 3, 7, 3
-; LE-32BIT-NEXT:    slw 14, 11, 15
-; LE-32BIT-NEXT:    cmplwi 1, 30, 64
-; LE-32BIT-NEXT:    cmplwi 30, 128
-; LE-32BIT-NEXT:    slw 24, 7, 24
-; LE-32BIT-NEXT:    mr 29, 12
-; LE-32BIT-NEXT:    or 12, 14, 3
-; LE-32BIT-NEXT:    srw 14, 11, 31
-; LE-32BIT-NEXT:    crnand 28, 0, 4
-; LE-32BIT-NEXT:    srw 31, 11, 30
-; LE-32BIT-NEXT:    or 24, 0, 24
-; LE-32BIT-NEXT:    slw 0, 23, 15
-; LE-32BIT-NEXT:    or 17, 17, 14
-; LE-32BIT-NEXT:    bc 12, 28, .LBB9_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 14, 31, 0
-; LE-32BIT-NEXT:    b .LBB9_3
-; LE-32BIT-NEXT:  .LBB9_2:
-; LE-32BIT-NEXT:    li 14, 0
-; LE-32BIT-NEXT:  .LBB9_3:
-; LE-32BIT-NEXT:    or 20, 20, 0
-; LE-32BIT-NEXT:    subfic 0, 16, 64
-; LE-32BIT-NEXT:    stw 14, 0(18)
-; LE-32BIT-NEXT:    subfic 14, 0, 32
-; LE-32BIT-NEXT:    slw 14, 4, 14
-; LE-32BIT-NEXT:    srw 31, 29, 0
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    slw 31, 29, 9
-; LE-32BIT-NEXT:    mr 3, 29
-; LE-32BIT-NEXT:    or 29, 21, 31
-; LE-32BIT-NEXT:    slw 31, 7, 25
-; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 29, 22, 31
-; LE-32BIT-NEXT:    slw 31, 3, 25
-; LE-32BIT-NEXT:    or 27, 27, 31
-; LE-32BIT-NEXT:    stw 27, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 31, 23, 9
-; LE-32BIT-NEXT:    lwz 27, 36(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 26, 26, 31
-; LE-32BIT-NEXT:    slw 25, 7, 9
-; LE-32BIT-NEXT:    or 12, 12, 25
-; LE-32BIT-NEXT:    slw 31, 7, 27
-; LE-32BIT-NEXT:    or 28, 28, 31
-; LE-32BIT-NEXT:    slw 31, 7, 15
-; LE-32BIT-NEXT:    or 22, 6, 31
-; LE-32BIT-NEXT:    lwz 31, 40(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 0, 4, 0
-; LE-32BIT-NEXT:    lwz 6, 32(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 27, 29, 0
-; LE-32BIT-NEXT:    cmplwi 6, 31, 64
-; LE-32BIT-NEXT:    srw 0, 10, 30
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_5
-; LE-32BIT-NEXT:  # %bb.4:
-; LE-32BIT-NEXT:    ori 25, 6, 0
-; LE-32BIT-NEXT:    b .LBB9_6
-; LE-32BIT-NEXT:  .LBB9_5:
-; LE-32BIT-NEXT:    addi 25, 24, 0
-; LE-32BIT-NEXT:  .LBB9_6:
-; LE-32BIT-NEXT:    lwz 6, 28(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 26, 0, 26
-; LE-32BIT-NEXT:    srw 0, 4, 31
-; LE-32BIT-NEXT:    or 28, 0, 28
-; LE-32BIT-NEXT:    srw 0, 4, 30
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_8
-; LE-32BIT-NEXT:  # %bb.7:
-; LE-32BIT-NEXT:    ori 9, 6, 0
-; LE-32BIT-NEXT:    b .LBB9_9
-; LE-32BIT-NEXT:  .LBB9_8:
-; LE-32BIT-NEXT:    addi 9, 20, 0
-; LE-32BIT-NEXT:  .LBB9_9:
-; LE-32BIT-NEXT:    or 6, 0, 12
-; LE-32BIT-NEXT:    lwz 12, 52(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 5, 4, 5
-; LE-32BIT-NEXT:    bc 12, 28, .LBB9_11
-; LE-32BIT-NEXT:  # %bb.10:
-; LE-32BIT-NEXT:    ori 0, 17, 0
-; LE-32BIT-NEXT:    b .LBB9_12
-; LE-32BIT-NEXT:  .LBB9_11:
-; LE-32BIT-NEXT:    li 0, 0
-; LE-32BIT-NEXT:  .LBB9_12:
-; LE-32BIT-NEXT:    or 5, 14, 5
-; LE-32BIT-NEXT:    stw 0, 4(18)
-; LE-32BIT-NEXT:    slw 21, 3, 16
-; LE-32BIT-NEXT:    cmplwi 7, 16, 64
-; LE-32BIT-NEXT:    cmplwi 3, 16, 0
-; LE-32BIT-NEXT:    slw 0, 7, 16
-; LE-32BIT-NEXT:    li 16, 0
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_14
-; LE-32BIT-NEXT:  # %bb.13:
-; LE-32BIT-NEXT:    ori 24, 19, 0
-; LE-32BIT-NEXT:    b .LBB9_15
-; LE-32BIT-NEXT:  .LBB9_14:
-; LE-32BIT-NEXT:    addi 24, 22, 0
-; LE-32BIT-NEXT:  .LBB9_15:
-; LE-32BIT-NEXT:    cmplwi 5, 30, 0
-; LE-32BIT-NEXT:    cmplwi 2, 31, 0
-; LE-32BIT-NEXT:    or 5, 0, 5
-; LE-32BIT-NEXT:    srw 17, 11, 12
-; LE-32BIT-NEXT:    bc 12, 28, .LBB9_17
-; LE-32BIT-NEXT:  # %bb.16:
-; LE-32BIT-NEXT:    ori 0, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_18
-; LE-32BIT-NEXT:  .LBB9_17:
-; LE-32BIT-NEXT:    addi 0, 21, 0
-; LE-32BIT-NEXT:  .LBB9_18:
-; LE-32BIT-NEXT:    lwz 21, 60(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 20, 3, 15
-; LE-32BIT-NEXT:    srw 19, 8, 12
-; LE-32BIT-NEXT:    bc 12, 10, .LBB9_19
-; LE-32BIT-NEXT:    b .LBB9_20
-; LE-32BIT-NEXT:  .LBB9_19:
-; LE-32BIT-NEXT:    addi 25, 3, 0
-; LE-32BIT-NEXT:  .LBB9_20:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB9_22
-; LE-32BIT-NEXT:  # %bb.21:
-; LE-32BIT-NEXT:    ori 12, 24, 0
-; LE-32BIT-NEXT:    b .LBB9_23
-; LE-32BIT-NEXT:  .LBB9_22:
-; LE-32BIT-NEXT:    addi 12, 3, 0
-; LE-32BIT-NEXT:  .LBB9_23:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_25
-; LE-32BIT-NEXT:  # %bb.24:
-; LE-32BIT-NEXT:    ori 3, 17, 0
-; LE-32BIT-NEXT:    b .LBB9_26
-; LE-32BIT-NEXT:  .LBB9_25:
-; LE-32BIT-NEXT:    addi 3, 6, 0
-; LE-32BIT-NEXT:  .LBB9_26:
-; LE-32BIT-NEXT:    lwz 6, 48(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 30, 8, 30
-; LE-32BIT-NEXT:    srw 29, 11, 31
-; LE-32BIT-NEXT:    bc 12, 22, .LBB9_27
-; LE-32BIT-NEXT:    b .LBB9_28
-; LE-32BIT-NEXT:  .LBB9_27:
-; LE-32BIT-NEXT:    addi 9, 21, 0
-; LE-32BIT-NEXT:  .LBB9_28:
-; LE-32BIT-NEXT:    mr 22, 4
-; LE-32BIT-NEXT:    lwz 4, 56(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 9, 9, 0
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_30
-; LE-32BIT-NEXT:  # %bb.29:
-; LE-32BIT-NEXT:    ori 0, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_31
-; LE-32BIT-NEXT:  .LBB9_30:
-; LE-32BIT-NEXT:    addi 0, 30, 0
-; LE-32BIT-NEXT:  .LBB9_31:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_33
-; LE-32BIT-NEXT:  # %bb.32:
-; LE-32BIT-NEXT:    ori 30, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_34
-; LE-32BIT-NEXT:  .LBB9_33:
-; LE-32BIT-NEXT:    addi 30, 29, 0
-; LE-32BIT-NEXT:  .LBB9_34:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_36
-; LE-32BIT-NEXT:  # %bb.35:
-; LE-32BIT-NEXT:    ori 29, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_37
-; LE-32BIT-NEXT:  .LBB9_36:
-; LE-32BIT-NEXT:    addi 29, 6, 0
-; LE-32BIT-NEXT:  .LBB9_37:
-; LE-32BIT-NEXT:    lwz 6, 44(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mr 14, 18
-; LE-32BIT-NEXT:    srw 18, 11, 4
-; LE-32BIT-NEXT:    lwz 4, 20(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_39
-; LE-32BIT-NEXT:  # %bb.38:
-; LE-32BIT-NEXT:    ori 24, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_40
-; LE-32BIT-NEXT:  .LBB9_39:
-; LE-32BIT-NEXT:    addi 24, 6, 0
-; LE-32BIT-NEXT:  .LBB9_40:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB9_42
-; LE-32BIT-NEXT:  # %bb.41:
-; LE-32BIT-NEXT:    ori 26, 19, 0
-; LE-32BIT-NEXT:    b .LBB9_42
-; LE-32BIT-NEXT:  .LBB9_42:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB9_43
-; LE-32BIT-NEXT:    b .LBB9_44
-; LE-32BIT-NEXT:  .LBB9_43:
-; LE-32BIT-NEXT:    addi 3, 22, 0
-; LE-32BIT-NEXT:  .LBB9_44:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB9_46
-; LE-32BIT-NEXT:  # %bb.45:
-; LE-32BIT-NEXT:    ori 5, 20, 0
-; LE-32BIT-NEXT:    b .LBB9_46
-; LE-32BIT-NEXT:  .LBB9_46:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB9_48
-; LE-32BIT-NEXT:  # %bb.47:
-; LE-32BIT-NEXT:    ori 9, 25, 0
-; LE-32BIT-NEXT:    b .LBB9_48
-; LE-32BIT-NEXT:  .LBB9_48:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB9_50
-; LE-32BIT-NEXT:  # %bb.49:
-; LE-32BIT-NEXT:    ori 28, 18, 0
-; LE-32BIT-NEXT:    b .LBB9_50
-; LE-32BIT-NEXT:  .LBB9_50:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB9_52
-; LE-32BIT-NEXT:  # %bb.51:
-; LE-32BIT-NEXT:    ori 12, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_52
-; LE-32BIT-NEXT:  .LBB9_52:
-; LE-32BIT-NEXT:    lwz 6, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 14, .LBB9_53
-; LE-32BIT-NEXT:    b .LBB9_54
-; LE-32BIT-NEXT:  .LBB9_53:
-; LE-32BIT-NEXT:    addi 5, 7, 0
-; LE-32BIT-NEXT:  .LBB9_54:
-; LE-32BIT-NEXT:    bc 12, 10, .LBB9_55
-; LE-32BIT-NEXT:    b .LBB9_56
-; LE-32BIT-NEXT:  .LBB9_55:
-; LE-32BIT-NEXT:    addi 28, 22, 0
-; LE-32BIT-NEXT:  .LBB9_56:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB9_57
-; LE-32BIT-NEXT:    b .LBB9_58
-; LE-32BIT-NEXT:  .LBB9_57:
-; LE-32BIT-NEXT:    addi 4, 27, 0
-; LE-32BIT-NEXT:  .LBB9_58:
-; LE-32BIT-NEXT:    stw 12, 12(14)
-; LE-32BIT-NEXT:    bc 12, 14, .LBB9_59
-; LE-32BIT-NEXT:    b .LBB9_60
-; LE-32BIT-NEXT:  .LBB9_59:
-; LE-32BIT-NEXT:    addi 4, 11, 0
-; LE-32BIT-NEXT:  .LBB9_60:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB9_62
-; LE-32BIT-NEXT:  # %bb.61:
-; LE-32BIT-NEXT:    ori 27, 16, 0
-; LE-32BIT-NEXT:    b .LBB9_63
-; LE-32BIT-NEXT:  .LBB9_62:
-; LE-32BIT-NEXT:    addi 27, 6, 0
-; LE-32BIT-NEXT:  .LBB9_63:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB9_65
-; LE-32BIT-NEXT:  # %bb.64:
-; LE-32BIT-NEXT:    ori 6, 26, 0
-; LE-32BIT-NEXT:    b .LBB9_66
-; LE-32BIT-NEXT:  .LBB9_65:
-; LE-32BIT-NEXT:    addi 6, 10, 0
-; LE-32BIT-NEXT:  .LBB9_66:
-; LE-32BIT-NEXT:    li 26, 0
-; LE-32BIT-NEXT:    bc 12, 0, .LBB9_68
-; LE-32BIT-NEXT:  # %bb.67:
-; LE-32BIT-NEXT:    ori 3, 26, 0
-; LE-32BIT-NEXT:    b .LBB9_68
-; LE-32BIT-NEXT:  .LBB9_68:
-; LE-32BIT-NEXT:    or 6, 6, 27
-; LE-32BIT-NEXT:    stw 3, 8(14)
-; LE-32BIT-NEXT:    or 3, 0, 4
-; LE-32BIT-NEXT:    bc 12, 22, .LBB9_70
-; LE-32BIT-NEXT:  # %bb.69:
-; LE-32BIT-NEXT:    ori 4, 9, 0
-; LE-32BIT-NEXT:    b .LBB9_71
-; LE-32BIT-NEXT:  .LBB9_70:
-; LE-32BIT-NEXT:    addi 4, 21, 0
-; LE-32BIT-NEXT:  .LBB9_71:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB9_73
-; LE-32BIT-NEXT:  # %bb.72:
-; LE-32BIT-NEXT:    ori 3, 30, 0
-; LE-32BIT-NEXT:    ori 6, 28, 0
-; LE-32BIT-NEXT:    b .LBB9_73
-; LE-32BIT-NEXT:  .LBB9_73:
-; LE-32BIT-NEXT:    stw 4, 28(14)
-; LE-32BIT-NEXT:    or 4, 29, 5
-; LE-32BIT-NEXT:    bc 12, 0, .LBB9_75
-; LE-32BIT-NEXT:  # %bb.74:
-; LE-32BIT-NEXT:    ori 4, 24, 0
-; LE-32BIT-NEXT:    b .LBB9_75
-; LE-32BIT-NEXT:  .LBB9_75:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB9_77
-; LE-32BIT-NEXT:  # %bb.76:
-; LE-32BIT-NEXT:    ori 5, 6, 0
-; LE-32BIT-NEXT:    b .LBB9_78
-; LE-32BIT-NEXT:  .LBB9_77:
-; LE-32BIT-NEXT:    addi 3, 8, 0
-; LE-32BIT-NEXT:    addi 5, 10, 0
-; LE-32BIT-NEXT:  .LBB9_78:
-; LE-32BIT-NEXT:    stw 3, 16(14)
-; LE-32BIT-NEXT:    bc 12, 22, .LBB9_80
-; LE-32BIT-NEXT:  # %bb.79:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB9_81
-; LE-32BIT-NEXT:  .LBB9_80:
-; LE-32BIT-NEXT:    addi 3, 23, 0
-; LE-32BIT-NEXT:  .LBB9_81:
-; LE-32BIT-NEXT:    stw 5, 24(14)
-; LE-32BIT-NEXT:    stw 3, 20(14)
-; LE-32BIT-NEXT:    lwz 12, 68(1)
-; LE-32BIT-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
-; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
-; LE-32BIT-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 144
+; LE-32BIT-NEXT:    stwu 1, -112(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 6, 48(1)
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 6, 28(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 31
+; LE-32BIT-NEXT:    stw 3, 80(1)
+; LE-32BIT-NEXT:    addi 3, 1, 52
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    sub 3, 3, 6
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 0, 76(1)
+; LE-32BIT-NEXT:    stw 12, 72(1)
+; LE-32BIT-NEXT:    stw 11, 68(1)
+; LE-32BIT-NEXT:    stw 10, 64(1)
+; LE-32BIT-NEXT:    stw 9, 60(1)
+; LE-32BIT-NEXT:    li 9, 7
+; LE-32BIT-NEXT:    stw 8, 56(1)
+; LE-32BIT-NEXT:    nand 9, 4, 9
+; LE-32BIT-NEXT:    stw 7, 52(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 29
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    subfic 30, 4, 32
+; LE-32BIT-NEXT:    lwz 7, 8(3)
+; LE-32BIT-NEXT:    clrlwi 9, 9, 27
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    slwi 29, 6, 1
+; LE-32BIT-NEXT:    lwz 10, 16(3)
+; LE-32BIT-NEXT:    srw 28, 7, 4
+; LE-32BIT-NEXT:    lwz 11, 20(3)
+; LE-32BIT-NEXT:    slwi 27, 8, 1
+; LE-32BIT-NEXT:    lwz 12, 24(3)
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    lwz 0, 0(3)
+; LE-32BIT-NEXT:    srw 6, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    srw 25, 12, 4
+; LE-32BIT-NEXT:    slw 12, 12, 30
+; LE-32BIT-NEXT:    slw 7, 7, 30
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    slw 10, 10, 30
+; LE-32BIT-NEXT:    slw 30, 0, 30
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    srw 0, 0, 4
+; LE-32BIT-NEXT:    srw 4, 11, 4
+; LE-32BIT-NEXT:    or 3, 12, 3
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    or 3, 10, 4
+; LE-32BIT-NEXT:    slwi 11, 11, 1
+; LE-32BIT-NEXT:    stw 3, 20(5)
+; LE-32BIT-NEXT:    or 3, 7, 8
+; LE-32BIT-NEXT:    slw 29, 29, 9
+; LE-32BIT-NEXT:    slw 27, 27, 9
+; LE-32BIT-NEXT:    slw 9, 11, 9
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 30, 6
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    or 3, 25, 9
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 26, 27
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 28, 29
+; LE-32BIT-NEXT:    stw 0, 0(5)
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -1182,584 +626,182 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: shl_32bytes:
 ; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    li 6, 16
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
-; LE-64BIT-NEXT:    ld 7, 24(3)
-; LE-64BIT-NEXT:    ld 8, 16(3)
-; LE-64BIT-NEXT:    ld 9, 8(3)
-; LE-64BIT-NEXT:    li 6, 0
-; LE-64BIT-NEXT:    ld 3, 0(3)
-; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 21, -88(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 24, -64(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    subfic 28, 4, 64
-; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    subfic 11, 4, 192
-; LE-64BIT-NEXT:    addi 0, 4, -128
-; LE-64BIT-NEXT:    subfic 25, 4, 128
-; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    sld 29, 9, 4
-; LE-64BIT-NEXT:    addi 27, 4, -64
-; LE-64BIT-NEXT:    std 22, -80(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    srd 24, 8, 28
-; LE-64BIT-NEXT:    srd 21, 9, 28
-; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    srd 28, 3, 28
-; LE-64BIT-NEXT:    sld 10, 7, 4
-; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    addi 30, 4, -192
+; LE-64BIT-NEXT:    xxlxor 1, 1, 1
+; LE-64BIT-NEXT:    lxvd2x 2, 0, 3
+; LE-64BIT-NEXT:    li 7, 48
+; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
+; LE-64BIT-NEXT:    addi 3, 1, -64
+; LE-64BIT-NEXT:    rlwinm 8, 4, 29, 27, 31
+; LE-64BIT-NEXT:    stxvd2x 1, 3, 6
+; LE-64BIT-NEXT:    li 6, 32
+; LE-64BIT-NEXT:    stxvd2x 0, 3, 7
+; LE-64BIT-NEXT:    neg 7, 8
+; LE-64BIT-NEXT:    addi 8, 1, -32
+; LE-64BIT-NEXT:    stxvd2x 2, 3, 6
+; LE-64BIT-NEXT:    li 6, 7
+; LE-64BIT-NEXT:    stxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    extsw 3, 7
+; LE-64BIT-NEXT:    nand 6, 4, 6
+; LE-64BIT-NEXT:    clrlwi 4, 4, 29
+; LE-64BIT-NEXT:    ldux 3, 8, 3
+; LE-64BIT-NEXT:    clrlwi 6, 6, 26
+; LE-64BIT-NEXT:    subfic 11, 4, 64
+; LE-64BIT-NEXT:    ld 7, 8(8)
+; LE-64BIT-NEXT:    ld 9, 16(8)
+; LE-64BIT-NEXT:    ld 8, 24(8)
+; LE-64BIT-NEXT:    rldicl 10, 7, 63, 1
+; LE-64BIT-NEXT:    sld 8, 8, 4
+; LE-64BIT-NEXT:    srd 6, 10, 6
+; LE-64BIT-NEXT:    sld 10, 9, 4
+; LE-64BIT-NEXT:    srd 9, 9, 11
 ; LE-64BIT-NEXT:    srd 11, 3, 11
-; LE-64BIT-NEXT:    subfic 22, 25, 64
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    sld 26, 9, 0
-; LE-64BIT-NEXT:    sld 28, 3, 27
-; LE-64BIT-NEXT:    std 23, -72(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    or 10, 10, 24
-; LE-64BIT-NEXT:    ld 24, -64(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 30, 3, 30
-; LE-64BIT-NEXT:    sld 23, 8, 27
-; LE-64BIT-NEXT:    or 11, 26, 11
-; LE-64BIT-NEXT:    or 29, 29, 28
-; LE-64BIT-NEXT:    sld 27, 9, 22
-; LE-64BIT-NEXT:    srd 28, 3, 25
-; LE-64BIT-NEXT:    or 10, 10, 23
-; LE-64BIT-NEXT:    or 11, 11, 30
-; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 9, 9, 25
-; LE-64BIT-NEXT:    or 30, 28, 27
-; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 23, -72(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    cmplwi 4, 128
-; LE-64BIT-NEXT:    sld 12, 8, 4
-; LE-64BIT-NEXT:    or 9, 10, 9
-; LE-64BIT-NEXT:    or 30, 30, 21
-; LE-64BIT-NEXT:    ld 22, -80(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 21, -88(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    cmplwi 1, 4, 0
-; LE-64BIT-NEXT:    sld 10, 3, 0
-; LE-64BIT-NEXT:    isellt 9, 9, 11
-; LE-64BIT-NEXT:    or 11, 12, 30
-; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    isel 7, 7, 9, 6
+; LE-64BIT-NEXT:    or 6, 10, 6
+; LE-64BIT-NEXT:    sld 7, 7, 4
+; LE-64BIT-NEXT:    or 8, 8, 9
+; LE-64BIT-NEXT:    std 6, 16(5)
+; LE-64BIT-NEXT:    or 7, 7, 11
 ; LE-64BIT-NEXT:    sld 3, 3, 4
-; LE-64BIT-NEXT:    isellt 9, 11, 10
-; LE-64BIT-NEXT:    std 7, 24(5)
-; LE-64BIT-NEXT:    isellt 0, 29, 6
-; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    isel 4, 8, 9, 6
-; LE-64BIT-NEXT:    std 0, 8(5)
-; LE-64BIT-NEXT:    isellt 3, 3, 6
-; LE-64BIT-NEXT:    std 4, 16(5)
+; LE-64BIT-NEXT:    std 8, 24(5)
 ; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    std 7, 8(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: shl_32bytes:
 ; BE:       # %bb.0:
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 7, 8(3)
+; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    ld 7, 0(3)
-; BE-NEXT:    ld 8, 8(3)
-; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    addi 9, 1, -64
+; BE-NEXT:    li 10, 0
+; BE-NEXT:    std 10, 56(9)
+; BE-NEXT:    std 10, 48(9)
+; BE-NEXT:    std 10, 40(9)
+; BE-NEXT:    std 10, 32(9)
+; BE-NEXT:    std 3, 24(9)
+; BE-NEXT:    std 8, 16(9)
+; BE-NEXT:    std 7, 8(9)
+; BE-NEXT:    std 6, -64(1)
+; BE-NEXT:    rlwinm 3, 4, 29, 27, 31
+; BE-NEXT:    ldux 6, 3, 9
+; BE-NEXT:    li 7, 7
+; BE-NEXT:    nand 7, 4, 7
+; BE-NEXT:    clrlwi 4, 4, 29
+; BE-NEXT:    clrlwi 7, 7, 26
+; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 9, 8(3)
 ; BE-NEXT:    ld 3, 24(3)
-; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; BE-NEXT:    li 6, 0
-; BE-NEXT:    subfic 10, 4, 192
-; BE-NEXT:    addi 11, 4, -128
-; BE-NEXT:    addi 12, 4, -192
-; BE-NEXT:    subfic 30, 4, 64
-; BE-NEXT:    srd 10, 3, 10
-; BE-NEXT:    sld 27, 9, 11
-; BE-NEXT:    sld 0, 7, 4
-; BE-NEXT:    addi 29, 4, -64
-; BE-NEXT:    subfic 28, 4, 128
-; BE-NEXT:    sld 12, 3, 12
-; BE-NEXT:    or 10, 27, 10
-; BE-NEXT:    srd 27, 8, 30
-; BE-NEXT:    or 10, 10, 12
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    sld 27, 8, 29
-; BE-NEXT:    subfic 12, 28, 64
-; BE-NEXT:    or 0, 0, 27
-; BE-NEXT:    srd 27, 3, 28
-; BE-NEXT:    sld 12, 9, 12
-; BE-NEXT:    srd 28, 9, 28
-; BE-NEXT:    cmplwi 4, 128
-; BE-NEXT:    or 12, 27, 12
-; BE-NEXT:    or 28, 0, 28
-; BE-NEXT:    srd 0, 9, 30
+; BE-NEXT:    subfic 10, 4, 64
+; BE-NEXT:    sld 6, 6, 4
+; BE-NEXT:    rldicl 11, 8, 63, 1
+; BE-NEXT:    sld 8, 8, 4
+; BE-NEXT:    srd 7, 11, 7
+; BE-NEXT:    srd 11, 9, 10
 ; BE-NEXT:    sld 9, 9, 4
-; BE-NEXT:    sld 11, 3, 11
-; BE-NEXT:    cmplwi 1, 4, 0
-; BE-NEXT:    or 12, 12, 0
-; BE-NEXT:    sld 0, 8, 4
-; BE-NEXT:    bc 12, 0, .LBB10_1
-; BE-NEXT:    b .LBB10_2
-; BE-NEXT:  .LBB10_1:
-; BE-NEXT:    addi 10, 28, 0
-; BE-NEXT:  .LBB10_2:
-; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; BE-NEXT:    or 12, 0, 12
-; BE-NEXT:    srd 0, 3, 30
-; BE-NEXT:    sld 30, 3, 29
-; BE-NEXT:    bc 12, 0, .LBB10_3
-; BE-NEXT:    b .LBB10_4
-; BE-NEXT:  .LBB10_3:
-; BE-NEXT:    addi 11, 12, 0
-; BE-NEXT:  .LBB10_4:
+; BE-NEXT:    srd 10, 3, 10
 ; BE-NEXT:    sld 3, 3, 4
-; BE-NEXT:    bc 12, 6, .LBB10_6
-; BE-NEXT:  # %bb.5:
-; BE-NEXT:    ori 4, 10, 0
-; BE-NEXT:    b .LBB10_7
-; BE-NEXT:  .LBB10_6:
-; BE-NEXT:    addi 4, 7, 0
-; BE-NEXT:  .LBB10_7:
-; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; BE-NEXT:    or 9, 9, 0
-; BE-NEXT:    or 9, 9, 30
-; BE-NEXT:    bc 12, 6, .LBB10_9
-; BE-NEXT:  # %bb.8:
-; BE-NEXT:    ori 7, 11, 0
-; BE-NEXT:    b .LBB10_10
-; BE-NEXT:  .LBB10_9:
-; BE-NEXT:    addi 7, 8, 0
-; BE-NEXT:  .LBB10_10:
-; BE-NEXT:    bc 12, 0, .LBB10_12
-; BE-NEXT:  # %bb.11:
-; BE-NEXT:    ori 8, 6, 0
-; BE-NEXT:    ori 3, 6, 0
-; BE-NEXT:    b .LBB10_13
-; BE-NEXT:  .LBB10_12:
-; BE-NEXT:    addi 8, 9, 0
-; BE-NEXT:  .LBB10_13:
-; BE-NEXT:    std 4, 0(5)
-; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; BE-NEXT:    or 6, 6, 11
+; BE-NEXT:    or 7, 9, 7
+; BE-NEXT:    or 8, 8, 10
 ; BE-NEXT:    std 3, 24(5)
 ; BE-NEXT:    std 8, 16(5)
+; BE-NEXT:    std 6, 0(5)
 ; BE-NEXT:    std 7, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: shl_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -144(1)
-; LE-32BIT-NEXT:    mfcr 12
-; LE-32BIT-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 12, 68(1)
-; LE-32BIT-NEXT:    lwz 30, 28(4)
-; LE-32BIT-NEXT:    stw 5, 64(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    lwz 6, 24(3)
-; LE-32BIT-NEXT:    subfic 21, 30, 224
-; LE-32BIT-NEXT:    lwz 5, 28(3)
-; LE-32BIT-NEXT:    subfic 29, 30, 160
-; LE-32BIT-NEXT:    lwz 7, 4(3)
-; LE-32BIT-NEXT:    addi 4, 30, -128
-; LE-32BIT-NEXT:    lwz 9, 0(3)
-; LE-32BIT-NEXT:    subfic 28, 30, 96
-; LE-32BIT-NEXT:    lwz 10, 8(3)
-; LE-32BIT-NEXT:    addi 0, 30, -64
-; LE-32BIT-NEXT:    lwz 8, 12(3)
-; LE-32BIT-NEXT:    subfic 25, 30, 32
-; LE-32BIT-NEXT:    lwz 12, 16(3)
-; LE-32BIT-NEXT:    srw 21, 5, 21
-; LE-32BIT-NEXT:    lwz 11, 20(3)
-; LE-32BIT-NEXT:    addi 3, 30, -192
-; LE-32BIT-NEXT:    slw 16, 6, 3
-; LE-32BIT-NEXT:    stw 3, 56(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 20, 9, 30
-; LE-32BIT-NEXT:    srw 15, 11, 29
-; LE-32BIT-NEXT:    slw 14, 12, 4
-; LE-32BIT-NEXT:    srw 31, 8, 28
-; LE-32BIT-NEXT:    slw 3, 10, 0
-; LE-32BIT-NEXT:    or 21, 16, 21
-; LE-32BIT-NEXT:    srw 16, 7, 25
-; LE-32BIT-NEXT:    slw 19, 10, 30
-; LE-32BIT-NEXT:    or 15, 14, 15
-; LE-32BIT-NEXT:    srw 14, 8, 25
-; LE-32BIT-NEXT:    or 3, 3, 31
-; LE-32BIT-NEXT:    srw 31, 5, 29
-; LE-32BIT-NEXT:    or 20, 20, 16
-; LE-32BIT-NEXT:    slw 16, 6, 4
-; LE-32BIT-NEXT:    addi 27, 30, -224
-; LE-32BIT-NEXT:    or 19, 19, 14
-; LE-32BIT-NEXT:    srw 14, 5, 28
-; LE-32BIT-NEXT:    or 16, 16, 31
-; LE-32BIT-NEXT:    slw 31, 6, 0
-; LE-32BIT-NEXT:    addi 23, 30, -160
-; LE-32BIT-NEXT:    slw 18, 12, 30
-; LE-32BIT-NEXT:    stw 0, 52(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    srw 31, 11, 25
-; LE-32BIT-NEXT:    slw 0, 5, 27
-; LE-32BIT-NEXT:    addi 26, 30, -96
-; LE-32BIT-NEXT:    slw 17, 6, 30
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    srw 31, 5, 25
-; LE-32BIT-NEXT:    or 21, 21, 0
-; LE-32BIT-NEXT:    slw 0, 11, 23
-; LE-32BIT-NEXT:    or 17, 17, 31
-; LE-32BIT-NEXT:    addi 31, 30, -32
-; LE-32BIT-NEXT:    or 0, 15, 0
-; LE-32BIT-NEXT:    slw 15, 8, 26
-; LE-32BIT-NEXT:    stw 29, 40(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 29, 3, 15
-; LE-32BIT-NEXT:    slw 15, 7, 31
-; LE-32BIT-NEXT:    or 20, 20, 15
-; LE-32BIT-NEXT:    slw 15, 8, 31
-; LE-32BIT-NEXT:    or 3, 19, 15
-; LE-32BIT-NEXT:    subfic 15, 30, 128
-; LE-32BIT-NEXT:    slw 23, 5, 23
-; LE-32BIT-NEXT:    stw 3, 48(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 3, 16, 23
-; LE-32BIT-NEXT:    subfic 16, 15, 32
-; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 3, 11, 15
-; LE-32BIT-NEXT:    slw 22, 12, 16
-; LE-32BIT-NEXT:    or 23, 3, 22
-; LE-32BIT-NEXT:    subfic 22, 30, 64
-; LE-32BIT-NEXT:    stw 9, 60(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 9, 10
-; LE-32BIT-NEXT:    subfic 3, 22, 32
-; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 4, 8, 22
-; LE-32BIT-NEXT:    slw 24, 9, 3
-; LE-32BIT-NEXT:    or 4, 4, 24
-; LE-32BIT-NEXT:    subfic 24, 30, 192
-; LE-32BIT-NEXT:    subfic 27, 24, 32
-; LE-32BIT-NEXT:    mr 10, 26
-; LE-32BIT-NEXT:    slw 27, 6, 27
-; LE-32BIT-NEXT:    srw 26, 5, 24
-; LE-32BIT-NEXT:    stw 28, 24(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 27, 26, 27
-; LE-32BIT-NEXT:    srw 26, 11, 22
-; LE-32BIT-NEXT:    slw 28, 12, 3
-; LE-32BIT-NEXT:    or 28, 26, 28
-; LE-32BIT-NEXT:    srw 26, 5, 15
-; LE-32BIT-NEXT:    slw 19, 6, 16
-; LE-32BIT-NEXT:    or 26, 26, 19
-; LE-32BIT-NEXT:    slw 19, 5, 10
-; LE-32BIT-NEXT:    stw 7, 32(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 7, 9
-; LE-32BIT-NEXT:    or 19, 14, 19
-; LE-32BIT-NEXT:    slw 14, 11, 31
-; LE-32BIT-NEXT:    lwz 9, 64(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 18, 18, 14
-; LE-32BIT-NEXT:    slw 3, 6, 3
-; LE-32BIT-NEXT:    srw 14, 5, 22
-; LE-32BIT-NEXT:    cmplwi 1, 30, 64
-; LE-32BIT-NEXT:    cmplwi 30, 128
-; LE-32BIT-NEXT:    srw 24, 6, 24
-; LE-32BIT-NEXT:    or 10, 14, 3
-; LE-32BIT-NEXT:    slw 14, 5, 31
-; LE-32BIT-NEXT:    crnand 28, 0, 4
-; LE-32BIT-NEXT:    slw 31, 5, 30
-; LE-32BIT-NEXT:    or 24, 0, 24
-; LE-32BIT-NEXT:    mr 3, 7
-; LE-32BIT-NEXT:    stw 7, 28(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 0, 7, 22
-; LE-32BIT-NEXT:    lwz 7, 24(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 17, 17, 14
-; LE-32BIT-NEXT:    bc 12, 28, .LBB10_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 14, 31, 0
-; LE-32BIT-NEXT:    b .LBB10_3
-; LE-32BIT-NEXT:  .LBB10_2:
-; LE-32BIT-NEXT:    li 14, 0
-; LE-32BIT-NEXT:  .LBB10_3:
-; LE-32BIT-NEXT:    or 20, 20, 0
-; LE-32BIT-NEXT:    subfic 0, 15, 64
-; LE-32BIT-NEXT:    stw 14, 28(9)
-; LE-32BIT-NEXT:    subfic 14, 0, 32
-; LE-32BIT-NEXT:    srw 14, 11, 14
-; LE-32BIT-NEXT:    slw 31, 12, 0
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    srw 31, 12, 7
-; LE-32BIT-NEXT:    or 23, 23, 31
-; LE-32BIT-NEXT:    srw 31, 3, 25
-; LE-32BIT-NEXT:    lwz 3, 40(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 4, 4, 31
-; LE-32BIT-NEXT:    slw 0, 11, 0
-; LE-32BIT-NEXT:    cmplwi 3, 15, 0
-; LE-32BIT-NEXT:    srw 31, 6, 3
-; LE-32BIT-NEXT:    or 27, 27, 31
-; LE-32BIT-NEXT:    srw 31, 12, 25
-; LE-32BIT-NEXT:    or 28, 28, 31
-; LE-32BIT-NEXT:    srw 31, 6, 7
-; LE-32BIT-NEXT:    or 26, 26, 31
-; LE-32BIT-NEXT:    srw 31, 6, 22
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    lwz 31, 36(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 25, 6, 25
-; LE-32BIT-NEXT:    or 3, 10, 25
-; LE-32BIT-NEXT:    or 26, 26, 0
-; LE-32BIT-NEXT:    cmplwi 6, 31, 64
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_5
-; LE-32BIT-NEXT:  # %bb.4:
-; LE-32BIT-NEXT:    ori 25, 21, 0
-; LE-32BIT-NEXT:    b .LBB10_6
-; LE-32BIT-NEXT:  .LBB10_5:
-; LE-32BIT-NEXT:    addi 25, 24, 0
-; LE-32BIT-NEXT:  .LBB10_6:
-; LE-32BIT-NEXT:    slw 24, 11, 16
-; LE-32BIT-NEXT:    slw 0, 11, 30
-; LE-32BIT-NEXT:    or 24, 14, 24
-; LE-32BIT-NEXT:    lwz 14, 32(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 3, 0, 3
-; LE-32BIT-NEXT:    bc 12, 28, .LBB10_8
-; LE-32BIT-NEXT:  # %bb.7:
-; LE-32BIT-NEXT:    ori 0, 17, 0
-; LE-32BIT-NEXT:    b .LBB10_9
-; LE-32BIT-NEXT:  .LBB10_8:
-; LE-32BIT-NEXT:    li 0, 0
-; LE-32BIT-NEXT:  .LBB10_9:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_11
-; LE-32BIT-NEXT:  # %bb.10:
-; LE-32BIT-NEXT:    ori 7, 29, 0
-; LE-32BIT-NEXT:    b .LBB10_12
-; LE-32BIT-NEXT:  .LBB10_11:
-; LE-32BIT-NEXT:    addi 7, 20, 0
-; LE-32BIT-NEXT:  .LBB10_12:
-; LE-32BIT-NEXT:    srw 20, 12, 15
-; LE-32BIT-NEXT:    stw 0, 24(9)
-; LE-32BIT-NEXT:    cmplwi 7, 15, 64
-; LE-32BIT-NEXT:    srw 0, 6, 15
-; LE-32BIT-NEXT:    li 15, 0
-; LE-32BIT-NEXT:    mr 16, 9
-; LE-32BIT-NEXT:    or 24, 0, 24
-; LE-32BIT-NEXT:    lwz 9, 52(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 28, .LBB10_14
-; LE-32BIT-NEXT:  # %bb.13:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_15
-; LE-32BIT-NEXT:  .LBB10_14:
-; LE-32BIT-NEXT:    addi 0, 20, 0
-; LE-32BIT-NEXT:  .LBB10_15:
-; LE-32BIT-NEXT:    slw 21, 14, 30
-; LE-32BIT-NEXT:    lwz 20, 60(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 4, 21, 4
-; LE-32BIT-NEXT:    slw 21, 11, 31
-; LE-32BIT-NEXT:    cmplwi 5, 30, 0
-; LE-32BIT-NEXT:    or 27, 21, 27
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_17
-; LE-32BIT-NEXT:  # %bb.16:
-; LE-32BIT-NEXT:    ori 21, 19, 0
-; LE-32BIT-NEXT:    b .LBB10_18
-; LE-32BIT-NEXT:  .LBB10_17:
-; LE-32BIT-NEXT:    addi 21, 18, 0
-; LE-32BIT-NEXT:  .LBB10_18:
-; LE-32BIT-NEXT:    slw 19, 8, 9
-; LE-32BIT-NEXT:    slw 17, 5, 9
-; LE-32BIT-NEXT:    bc 12, 22, .LBB10_20
-; LE-32BIT-NEXT:  # %bb.19:
-; LE-32BIT-NEXT:    ori 9, 7, 0
-; LE-32BIT-NEXT:    b .LBB10_21
-; LE-32BIT-NEXT:  .LBB10_20:
-; LE-32BIT-NEXT:    addi 9, 20, 0
-; LE-32BIT-NEXT:  .LBB10_21:
-; LE-32BIT-NEXT:    lwz 7, 48(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 30, 8, 30
-; LE-32BIT-NEXT:    lwz 10, 56(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    slw 29, 5, 31
-; LE-32BIT-NEXT:    or 9, 9, 0
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_23
-; LE-32BIT-NEXT:  # %bb.22:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_24
-; LE-32BIT-NEXT:  .LBB10_23:
-; LE-32BIT-NEXT:    addi 0, 30, 0
-; LE-32BIT-NEXT:  .LBB10_24:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_26
-; LE-32BIT-NEXT:  # %bb.25:
-; LE-32BIT-NEXT:    ori 30, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_27
-; LE-32BIT-NEXT:  .LBB10_26:
-; LE-32BIT-NEXT:    addi 30, 29, 0
-; LE-32BIT-NEXT:  .LBB10_27:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_29
-; LE-32BIT-NEXT:  # %bb.28:
-; LE-32BIT-NEXT:    ori 29, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_30
-; LE-32BIT-NEXT:  .LBB10_29:
-; LE-32BIT-NEXT:    addi 29, 7, 0
-; LE-32BIT-NEXT:  .LBB10_30:
-; LE-32BIT-NEXT:    lwz 7, 44(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 28, .LBB10_31
-; LE-32BIT-NEXT:    b .LBB10_32
-; LE-32BIT-NEXT:  .LBB10_31:
-; LE-32BIT-NEXT:    addi 28, 26, 0
-; LE-32BIT-NEXT:  .LBB10_32:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_34
-; LE-32BIT-NEXT:  # %bb.33:
-; LE-32BIT-NEXT:    ori 3, 17, 0
-; LE-32BIT-NEXT:    b .LBB10_34
-; LE-32BIT-NEXT:  .LBB10_34:
-; LE-32BIT-NEXT:    srw 22, 12, 22
-; LE-32BIT-NEXT:    slw 18, 5, 10
-; LE-32BIT-NEXT:    bc 12, 4, .LBB10_36
-; LE-32BIT-NEXT:  # %bb.35:
-; LE-32BIT-NEXT:    ori 4, 19, 0
-; LE-32BIT-NEXT:    b .LBB10_36
-; LE-32BIT-NEXT:  .LBB10_36:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB10_38
-; LE-32BIT-NEXT:  # %bb.37:
-; LE-32BIT-NEXT:    ori 5, 28, 0
-; LE-32BIT-NEXT:    b .LBB10_38
-; LE-32BIT-NEXT:  .LBB10_38:
-; LE-32BIT-NEXT:    li 28, 0
-; LE-32BIT-NEXT:    bc 12, 22, .LBB10_39
-; LE-32BIT-NEXT:    b .LBB10_40
-; LE-32BIT-NEXT:  .LBB10_39:
-; LE-32BIT-NEXT:    addi 3, 11, 0
-; LE-32BIT-NEXT:  .LBB10_40:
-; LE-32BIT-NEXT:    cmplwi 2, 31, 0
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_42
-; LE-32BIT-NEXT:  # %bb.41:
-; LE-32BIT-NEXT:    ori 27, 18, 0
-; LE-32BIT-NEXT:    b .LBB10_42
-; LE-32BIT-NEXT:  .LBB10_42:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB10_44
-; LE-32BIT-NEXT:  # %bb.43:
-; LE-32BIT-NEXT:    ori 26, 22, 0
-; LE-32BIT-NEXT:    b .LBB10_45
-; LE-32BIT-NEXT:  .LBB10_44:
-; LE-32BIT-NEXT:    addi 26, 24, 0
-; LE-32BIT-NEXT:  .LBB10_45:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB10_47
-; LE-32BIT-NEXT:  # %bb.46:
-; LE-32BIT-NEXT:    ori 24, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_48
-; LE-32BIT-NEXT:  .LBB10_47:
-; LE-32BIT-NEXT:    addi 24, 7, 0
-; LE-32BIT-NEXT:  .LBB10_48:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB10_50
-; LE-32BIT-NEXT:  # %bb.49:
-; LE-32BIT-NEXT:    ori 7, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_51
-; LE-32BIT-NEXT:  .LBB10_50:
-; LE-32BIT-NEXT:    addi 7, 23, 0
-; LE-32BIT-NEXT:  .LBB10_51:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB10_52
-; LE-32BIT-NEXT:    b .LBB10_53
-; LE-32BIT-NEXT:  .LBB10_52:
-; LE-32BIT-NEXT:    addi 4, 14, 0
-; LE-32BIT-NEXT:  .LBB10_53:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB10_55
-; LE-32BIT-NEXT:  # %bb.54:
-; LE-32BIT-NEXT:    ori 3, 28, 0
-; LE-32BIT-NEXT:    b .LBB10_55
-; LE-32BIT-NEXT:  .LBB10_55:
-; LE-32BIT-NEXT:    bc 12, 10, .LBB10_56
-; LE-32BIT-NEXT:    b .LBB10_57
-; LE-32BIT-NEXT:  .LBB10_56:
-; LE-32BIT-NEXT:    addi 25, 12, 0
-; LE-32BIT-NEXT:  .LBB10_57:
-; LE-32BIT-NEXT:    or 5, 0, 5
-; LE-32BIT-NEXT:    bc 12, 10, .LBB10_58
-; LE-32BIT-NEXT:    b .LBB10_59
-; LE-32BIT-NEXT:  .LBB10_58:
-; LE-32BIT-NEXT:    addi 27, 11, 0
-; LE-32BIT-NEXT:  .LBB10_59:
-; LE-32BIT-NEXT:    stw 3, 20(16)
-; LE-32BIT-NEXT:    or 3, 4, 7
-; LE-32BIT-NEXT:    bc 12, 0, .LBB10_61
-; LE-32BIT-NEXT:  # %bb.60:
-; LE-32BIT-NEXT:    ori 3, 27, 0
-; LE-32BIT-NEXT:    ori 9, 25, 0
-; LE-32BIT-NEXT:    b .LBB10_61
-; LE-32BIT-NEXT:  .LBB10_61:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB10_63
-; LE-32BIT-NEXT:  # %bb.62:
-; LE-32BIT-NEXT:    ori 6, 26, 0
-; LE-32BIT-NEXT:    b .LBB10_63
-; LE-32BIT-NEXT:  .LBB10_63:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB10_65
-; LE-32BIT-NEXT:  # %bb.64:
-; LE-32BIT-NEXT:    ori 12, 21, 0
-; LE-32BIT-NEXT:    b .LBB10_65
-; LE-32BIT-NEXT:  .LBB10_65:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB10_67
-; LE-32BIT-NEXT:  # %bb.66:
-; LE-32BIT-NEXT:    ori 5, 30, 0
-; LE-32BIT-NEXT:    b .LBB10_67
-; LE-32BIT-NEXT:  .LBB10_67:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB10_69
-; LE-32BIT-NEXT:  # %bb.68:
-; LE-32BIT-NEXT:    ori 4, 9, 0
-; LE-32BIT-NEXT:    b .LBB10_70
-; LE-32BIT-NEXT:  .LBB10_69:
-; LE-32BIT-NEXT:    addi 3, 14, 0
-; LE-32BIT-NEXT:    addi 4, 20, 0
-; LE-32BIT-NEXT:  .LBB10_70:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB10_72
-; LE-32BIT-NEXT:  # %bb.71:
-; LE-32BIT-NEXT:    ori 12, 15, 0
-; LE-32BIT-NEXT:    b .LBB10_72
-; LE-32BIT-NEXT:  .LBB10_72:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB10_73
-; LE-32BIT-NEXT:    b .LBB10_74
-; LE-32BIT-NEXT:  .LBB10_73:
-; LE-32BIT-NEXT:    addi 5, 8, 0
-; LE-32BIT-NEXT:  .LBB10_74:
-; LE-32BIT-NEXT:    stw 3, 4(16)
-; LE-32BIT-NEXT:    lwz 3, 28(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    stw 4, 0(16)
-; LE-32BIT-NEXT:    or 4, 29, 6
-; LE-32BIT-NEXT:    bc 12, 0, .LBB10_76
-; LE-32BIT-NEXT:  # %bb.75:
-; LE-32BIT-NEXT:    ori 4, 24, 0
-; LE-32BIT-NEXT:    b .LBB10_76
-; LE-32BIT-NEXT:  .LBB10_76:
-; LE-32BIT-NEXT:    stw 12, 16(16)
-; LE-32BIT-NEXT:    bc 12, 22, .LBB10_78
-; LE-32BIT-NEXT:  # %bb.77:
-; LE-32BIT-NEXT:    ori 3, 4, 0
-; LE-32BIT-NEXT:    b .LBB10_78
-; LE-32BIT-NEXT:  .LBB10_78:
-; LE-32BIT-NEXT:    stw 5, 12(16)
-; LE-32BIT-NEXT:    stw 3, 8(16)
-; LE-32BIT-NEXT:    lwz 12, 68(1)
-; LE-32BIT-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
-; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
-; LE-32BIT-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 144
+; LE-32BIT-NEXT:    stwu 1, -112(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 6, 80(1)
+; LE-32BIT-NEXT:    stw 6, 76(1)
+; LE-32BIT-NEXT:    stw 6, 72(1)
+; LE-32BIT-NEXT:    stw 6, 68(1)
+; LE-32BIT-NEXT:    stw 6, 64(1)
+; LE-32BIT-NEXT:    stw 6, 60(1)
+; LE-32BIT-NEXT:    stw 6, 56(1)
+; LE-32BIT-NEXT:    stw 6, 52(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 31
+; LE-32BIT-NEXT:    stw 3, 48(1)
+; LE-32BIT-NEXT:    addi 3, 1, 20
+; LE-32BIT-NEXT:    stw 0, 44(1)
+; LE-32BIT-NEXT:    stw 12, 40(1)
+; LE-32BIT-NEXT:    stw 11, 36(1)
+; LE-32BIT-NEXT:    stw 10, 32(1)
+; LE-32BIT-NEXT:    stw 9, 28(1)
+; LE-32BIT-NEXT:    stw 8, 24(1)
+; LE-32BIT-NEXT:    li 8, 7
+; LE-32BIT-NEXT:    stw 7, 20(1)
+; LE-32BIT-NEXT:    nand 8, 4, 8
+; LE-32BIT-NEXT:    lwzux 3, 6, 3
+; LE-32BIT-NEXT:    clrlwi 4, 4, 29
+; LE-32BIT-NEXT:    subfic 0, 4, 32
+; LE-32BIT-NEXT:    clrlwi 8, 8, 27
+; LE-32BIT-NEXT:    lwz 7, 8(6)
+; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 9, 4(6)
+; LE-32BIT-NEXT:    lwz 10, 16(6)
+; LE-32BIT-NEXT:    srwi 29, 7, 1
+; LE-32BIT-NEXT:    lwz 11, 12(6)
+; LE-32BIT-NEXT:    slw 28, 9, 4
+; LE-32BIT-NEXT:    lwz 12, 24(6)
+; LE-32BIT-NEXT:    srwi 27, 10, 1
+; LE-32BIT-NEXT:    lwz 30, 20(6)
+; LE-32BIT-NEXT:    slw 26, 11, 4
+; LE-32BIT-NEXT:    lwz 6, 28(6)
+; LE-32BIT-NEXT:    srw 9, 9, 0
+; LE-32BIT-NEXT:    slw 25, 30, 4
+; LE-32BIT-NEXT:    srw 11, 11, 0
+; LE-32BIT-NEXT:    slw 7, 7, 4
+; LE-32BIT-NEXT:    srw 30, 30, 0
+; LE-32BIT-NEXT:    slw 10, 10, 4
+; LE-32BIT-NEXT:    srw 0, 6, 0
+; LE-32BIT-NEXT:    slw 6, 6, 4
+; LE-32BIT-NEXT:    slw 4, 12, 4
+; LE-32BIT-NEXT:    srwi 12, 12, 1
+; LE-32BIT-NEXT:    srw 29, 29, 8
+; LE-32BIT-NEXT:    srw 27, 27, 8
+; LE-32BIT-NEXT:    srw 8, 12, 8
+; LE-32BIT-NEXT:    or 3, 3, 9
+; LE-32BIT-NEXT:    or 4, 4, 0
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    or 3, 25, 8
+; LE-32BIT-NEXT:    stw 4, 24(5)
+; LE-32BIT-NEXT:    or 4, 10, 30
+; LE-32BIT-NEXT:    stw 3, 20(5)
+; LE-32BIT-NEXT:    or 3, 26, 27
+; LE-32BIT-NEXT:    stw 4, 16(5)
+; LE-32BIT-NEXT:    or 4, 7, 11
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 28, 29
+; LE-32BIT-NEXT:    stw 6, 28(5)
+; LE-32BIT-NEXT:    stw 4, 8(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -1770,632 +812,184 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: ashr_32bytes:
 ; LE-64BIT:       # %bb.0:
-; LE-64BIT-NEXT:    lwz 4, 0(4)
 ; LE-64BIT-NEXT:    ld 7, 16(3)
 ; LE-64BIT-NEXT:    ld 8, 24(3)
-; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    subfic 9, 4, 192
-; LE-64BIT-NEXT:    addi 10, 4, -128
-; LE-64BIT-NEXT:    addi 0, 4, -192
-; LE-64BIT-NEXT:    subfic 29, 4, 64
-; LE-64BIT-NEXT:    ld 6, 0(3)
-; LE-64BIT-NEXT:    srd 12, 7, 4
-; LE-64BIT-NEXT:    sld 9, 8, 9
-; LE-64BIT-NEXT:    addi 28, 4, -64
-; LE-64BIT-NEXT:    ld 3, 8(3)
-; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
-; LE-64BIT-NEXT:    srd 30, 7, 10
-; LE-64BIT-NEXT:    srad 27, 8, 0
-; LE-64BIT-NEXT:    cmpwi 0, 1
-; LE-64BIT-NEXT:    sld 0, 8, 29
-; LE-64BIT-NEXT:    or 9, 30, 9
-; LE-64BIT-NEXT:    subfic 30, 4, 128
-; LE-64BIT-NEXT:    srad 26, 8, 28
-; LE-64BIT-NEXT:    cmpwi 1, 28, 1
-; LE-64BIT-NEXT:    or 12, 12, 0
-; LE-64BIT-NEXT:    subfic 25, 30, 64
-; LE-64BIT-NEXT:    srd 11, 6, 4
-; LE-64BIT-NEXT:    isel 12, 12, 26, 4
-; LE-64BIT-NEXT:    sld 26, 3, 29
-; LE-64BIT-NEXT:    srd 28, 3, 28
-; LE-64BIT-NEXT:    or 11, 11, 26
-; LE-64BIT-NEXT:    sld 29, 7, 29
-; LE-64BIT-NEXT:    srd 26, 7, 25
-; LE-64BIT-NEXT:    sld 7, 7, 30
-; LE-64BIT-NEXT:    or 11, 11, 28
-; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    sld 30, 8, 30
-; LE-64BIT-NEXT:    isellt 9, 9, 27
-; LE-64BIT-NEXT:    or 7, 11, 7
-; LE-64BIT-NEXT:    cmplwi 4, 128
-; LE-64BIT-NEXT:    sradi 27, 8, 63
-; LE-64BIT-NEXT:    or 30, 30, 26
-; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    srd 0, 3, 4
-; LE-64BIT-NEXT:    isellt 11, 12, 27
-; LE-64BIT-NEXT:    or 12, 30, 29
-; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    cmplwi 1, 4, 0
-; LE-64BIT-NEXT:    srad 10, 8, 10
-; LE-64BIT-NEXT:    std 11, 16(5)
-; LE-64BIT-NEXT:    isellt 7, 7, 9
-; LE-64BIT-NEXT:    or 9, 0, 12
-; LE-64BIT-NEXT:    isel 6, 6, 7, 6
-; LE-64BIT-NEXT:    srad 4, 8, 4
-; LE-64BIT-NEXT:    isellt 7, 9, 10
-; LE-64BIT-NEXT:    std 6, 0(5)
-; LE-64BIT-NEXT:    isel 3, 3, 7, 6
-; LE-64BIT-NEXT:    isellt 4, 4, 27
-; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; LE-64BIT-NEXT:    std 3, 8(5)
+; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    addi 6, 1, -64
+; LE-64BIT-NEXT:    sradi 3, 8, 63
+; LE-64BIT-NEXT:    std 8, 24(6)
+; LE-64BIT-NEXT:    std 7, 16(6)
+; LE-64BIT-NEXT:    std 3, 56(6)
+; LE-64BIT-NEXT:    rlwinm 7, 4, 29, 27, 31
+; LE-64BIT-NEXT:    std 3, 48(6)
+; LE-64BIT-NEXT:    std 3, 40(6)
+; LE-64BIT-NEXT:    std 3, 32(6)
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 6
+; LE-64BIT-NEXT:    ldux 3, 7, 6
+; LE-64BIT-NEXT:    li 6, 7
+; LE-64BIT-NEXT:    nand 6, 4, 6
+; LE-64BIT-NEXT:    clrlwi 4, 4, 29
+; LE-64BIT-NEXT:    clrlwi 6, 6, 26
+; LE-64BIT-NEXT:    subfic 11, 4, 64
+; LE-64BIT-NEXT:    ld 8, 16(7)
+; LE-64BIT-NEXT:    ld 9, 8(7)
+; LE-64BIT-NEXT:    ld 7, 24(7)
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    sldi 10, 8, 1
+; LE-64BIT-NEXT:    srd 8, 8, 4
+; LE-64BIT-NEXT:    sld 6, 10, 6
+; LE-64BIT-NEXT:    srd 10, 9, 4
+; LE-64BIT-NEXT:    sld 9, 9, 11
+; LE-64BIT-NEXT:    sld 11, 7, 11
+; LE-64BIT-NEXT:    or 6, 10, 6
+; LE-64BIT-NEXT:    or 8, 11, 8
+; LE-64BIT-NEXT:    or 3, 9, 3
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    srad 4, 7, 4
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    std 8, 16(5)
 ; LE-64BIT-NEXT:    std 4, 24(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: ashr_32bytes:
 ; BE:       # %bb.0:
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 7, 8(3)
+; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    ld 6, 24(3)
-; BE-NEXT:    ld 7, 16(3)
-; BE-NEXT:    ld 8, 8(3)
-; BE-NEXT:    ld 3, 0(3)
-; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; BE-NEXT:    subfic 9, 4, 192
-; BE-NEXT:    addi 10, 4, -128
-; BE-NEXT:    addi 11, 4, -192
-; BE-NEXT:    subfic 0, 4, 64
-; BE-NEXT:    sld 9, 3, 9
-; BE-NEXT:    srd 27, 8, 10
-; BE-NEXT:    srd 12, 6, 4
-; BE-NEXT:    subfic 29, 4, 128
-; BE-NEXT:    cmpwi 11, 1
-; BE-NEXT:    srad 11, 3, 11
-; BE-NEXT:    or 9, 27, 9
-; BE-NEXT:    sld 27, 7, 0
-; BE-NEXT:    addi 30, 4, -64
-; BE-NEXT:    srd 28, 8, 4
-; BE-NEXT:    or 12, 12, 27
-; BE-NEXT:    sld 27, 3, 0
-; BE-NEXT:    bc 12, 0, .LBB11_2
-; BE-NEXT:  # %bb.1:
-; BE-NEXT:    ori 9, 11, 0
-; BE-NEXT:    b .LBB11_2
-; BE-NEXT:  .LBB11_2:
-; BE-NEXT:    subfic 11, 29, 64
-; BE-NEXT:    or 28, 28, 27
-; BE-NEXT:    srd 27, 7, 30
-; BE-NEXT:    sld 0, 8, 0
-; BE-NEXT:    srd 11, 8, 11
-; BE-NEXT:    sld 8, 8, 29
-; BE-NEXT:    sld 29, 3, 29
-; BE-NEXT:    cmplwi 4, 128
-; BE-NEXT:    or 12, 12, 27
-; BE-NEXT:    or 11, 29, 11
-; BE-NEXT:    or 8, 12, 8
-; BE-NEXT:    srd 12, 7, 4
-; BE-NEXT:    or 11, 11, 0
-; BE-NEXT:    cmpwi 1, 30, 1
-; BE-NEXT:    srad 30, 3, 30
-; BE-NEXT:    bc 12, 0, .LBB11_4
-; BE-NEXT:  # %bb.3:
-; BE-NEXT:    ori 8, 9, 0
-; BE-NEXT:    b .LBB11_4
-; BE-NEXT:  .LBB11_4:
-; BE-NEXT:    or 9, 12, 11
-; BE-NEXT:    srad 10, 3, 10
-; BE-NEXT:    bc 12, 4, .LBB11_6
-; BE-NEXT:  # %bb.5:
-; BE-NEXT:    ori 11, 30, 0
-; BE-NEXT:    b .LBB11_7
-; BE-NEXT:  .LBB11_6:
-; BE-NEXT:    addi 11, 28, 0
-; BE-NEXT:  .LBB11_7:
-; BE-NEXT:    cmplwi 1, 4, 0
-; BE-NEXT:    bc 12, 0, .LBB11_9
-; BE-NEXT:  # %bb.8:
-; BE-NEXT:    ori 9, 10, 0
-; BE-NEXT:    b .LBB11_9
-; BE-NEXT:  .LBB11_9:
-; BE-NEXT:    sradi 10, 3, 63
+; BE-NEXT:    addi 9, 1, -64
+; BE-NEXT:    addi 10, 1, -32
+; BE-NEXT:    std 3, 56(9)
+; BE-NEXT:    std 6, 32(9)
+; BE-NEXT:    sradi 3, 6, 63
+; BE-NEXT:    rlwinm 6, 4, 29, 27, 31
+; BE-NEXT:    std 3, 24(9)
+; BE-NEXT:    std 3, 16(9)
+; BE-NEXT:    std 3, 8(9)
+; BE-NEXT:    std 3, -64(1)
+; BE-NEXT:    neg 3, 6
+; BE-NEXT:    std 8, 48(9)
+; BE-NEXT:    std 7, 40(9)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    ldux 3, 10, 3
+; BE-NEXT:    li 6, 7
+; BE-NEXT:    nand 6, 4, 6
+; BE-NEXT:    clrlwi 4, 4, 29
+; BE-NEXT:    clrlwi 6, 6, 26
+; BE-NEXT:    ld 7, 8(10)
+; BE-NEXT:    ld 8, 16(10)
+; BE-NEXT:    ld 9, 24(10)
+; BE-NEXT:    subfic 10, 4, 64
+; BE-NEXT:    sldi 11, 7, 1
+; BE-NEXT:    srd 7, 7, 4
+; BE-NEXT:    srd 9, 9, 4
+; BE-NEXT:    sld 6, 11, 6
+; BE-NEXT:    sld 11, 3, 10
+; BE-NEXT:    sld 10, 8, 10
+; BE-NEXT:    srd 8, 8, 4
 ; BE-NEXT:    srad 3, 3, 4
-; BE-NEXT:    bc 12, 6, .LBB11_11
-; BE-NEXT:  # %bb.10:
-; BE-NEXT:    ori 4, 8, 0
-; BE-NEXT:    b .LBB11_12
-; BE-NEXT:  .LBB11_11:
-; BE-NEXT:    addi 4, 6, 0
-; BE-NEXT:  .LBB11_12:
-; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; BE-NEXT:    bc 12, 6, .LBB11_14
-; BE-NEXT:  # %bb.13:
-; BE-NEXT:    ori 6, 9, 0
-; BE-NEXT:    b .LBB11_15
-; BE-NEXT:  .LBB11_14:
-; BE-NEXT:    addi 6, 7, 0
-; BE-NEXT:  .LBB11_15:
-; BE-NEXT:    bc 12, 0, .LBB11_17
-; BE-NEXT:  # %bb.16:
-; BE-NEXT:    ori 7, 10, 0
-; BE-NEXT:    ori 3, 10, 0
-; BE-NEXT:    b .LBB11_18
-; BE-NEXT:  .LBB11_17:
-; BE-NEXT:    addi 7, 11, 0
-; BE-NEXT:  .LBB11_18:
-; BE-NEXT:    std 4, 24(5)
+; BE-NEXT:    or 7, 11, 7
+; BE-NEXT:    or 6, 8, 6
+; BE-NEXT:    or 8, 10, 9
 ; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    std 8, 24(5)
 ; BE-NEXT:    std 7, 8(5)
 ; BE-NEXT:    std 6, 16(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: ashr_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -160(1)
-; LE-32BIT-NEXT:    mfcr 12
-; LE-32BIT-NEXT:    stw 14, 88(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 15, 92(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 16, 96(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 17, 100(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 18, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 19, 108(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 20, 112(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 21, 116(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 22, 120(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 23, 124(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 24, 128(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 25, 132(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 26, 136(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 27, 140(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 28, 144(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 29, 148(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 30, 152(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 31, 156(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 12, 84(1)
-; LE-32BIT-NEXT:    lwz 30, 28(4)
-; LE-32BIT-NEXT:    lwz 10, 4(3)
-; LE-32BIT-NEXT:    lwz 6, 0(3)
-; LE-32BIT-NEXT:    subfic 23, 30, 224
-; LE-32BIT-NEXT:    stw 5, 80(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    addi 21, 30, -224
-; LE-32BIT-NEXT:    lwz 5, 24(3)
-; LE-32BIT-NEXT:    subfic 4, 30, 160
-; LE-32BIT-NEXT:    lwz 8, 28(3)
-; LE-32BIT-NEXT:    addi 0, 30, -128
+; LE-32BIT-NEXT:    stwu 1, -112(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    addi 6, 1, 52
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
 ; LE-32BIT-NEXT:    lwz 12, 20(3)
-; LE-32BIT-NEXT:    subfic 28, 30, 96
-; LE-32BIT-NEXT:    lwz 9, 16(3)
-; LE-32BIT-NEXT:    addi 29, 30, -64
-; LE-32BIT-NEXT:    lwz 27, 12(3)
-; LE-32BIT-NEXT:    subfic 25, 30, 32
-; LE-32BIT-NEXT:    lwz 11, 8(3)
-; LE-32BIT-NEXT:    addi 3, 30, -192
-; LE-32BIT-NEXT:    slw 23, 6, 23
-; LE-32BIT-NEXT:    srw 16, 10, 3
-; LE-32BIT-NEXT:    stw 3, 72(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 20, 8, 30
-; LE-32BIT-NEXT:    sraw 15, 6, 21
-; LE-32BIT-NEXT:    cmpwi 21, 1
-; LE-32BIT-NEXT:    slw 21, 11, 4
-; LE-32BIT-NEXT:    srw 14, 27, 0
-; LE-32BIT-NEXT:    slw 31, 9, 28
-; LE-32BIT-NEXT:    srw 3, 12, 29
-; LE-32BIT-NEXT:    or 23, 16, 23
-; LE-32BIT-NEXT:    slw 16, 5, 25
-; LE-32BIT-NEXT:    srw 19, 12, 30
-; LE-32BIT-NEXT:    or 21, 14, 21
-; LE-32BIT-NEXT:    slw 14, 9, 25
-; LE-32BIT-NEXT:    or 3, 3, 31
-; LE-32BIT-NEXT:    slw 31, 6, 4
-; LE-32BIT-NEXT:    or 20, 20, 16
-; LE-32BIT-NEXT:    srw 16, 10, 0
-; LE-32BIT-NEXT:    or 19, 19, 14
-; LE-32BIT-NEXT:    slw 14, 6, 28
-; LE-32BIT-NEXT:    or 16, 16, 31
-; LE-32BIT-NEXT:    srw 31, 10, 29
-; LE-32BIT-NEXT:    addi 24, 30, -160
-; LE-32BIT-NEXT:    srw 18, 27, 30
-; LE-32BIT-NEXT:    or 14, 31, 14
-; LE-32BIT-NEXT:    slw 31, 11, 25
-; LE-32BIT-NEXT:    addi 7, 30, -96
-; LE-32BIT-NEXT:    srw 17, 10, 30
-; LE-32BIT-NEXT:    stw 4, 48(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 18, 18, 31
-; LE-32BIT-NEXT:    slw 31, 6, 25
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_2
-; LE-32BIT-NEXT:  # %bb.1:
-; LE-32BIT-NEXT:    ori 4, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_3
-; LE-32BIT-NEXT:  .LBB11_2:
-; LE-32BIT-NEXT:    addi 4, 23, 0
-; LE-32BIT-NEXT:  .LBB11_3:
-; LE-32BIT-NEXT:    srw 15, 11, 24
-; LE-32BIT-NEXT:    or 17, 17, 31
-; LE-32BIT-NEXT:    addi 31, 30, -32
-; LE-32BIT-NEXT:    or 21, 21, 15
-; LE-32BIT-NEXT:    srw 15, 9, 7
-; LE-32BIT-NEXT:    or 3, 3, 15
-; LE-32BIT-NEXT:    srw 15, 5, 31
-; LE-32BIT-NEXT:    or 20, 20, 15
-; LE-32BIT-NEXT:    srw 15, 9, 31
-; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 3, 19, 15
-; LE-32BIT-NEXT:    subfic 15, 30, 64
-; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    cmpwi 24, 1
-; LE-32BIT-NEXT:    sraw 24, 6, 24
-; LE-32BIT-NEXT:    subfic 4, 15, 32
-; LE-32BIT-NEXT:    stw 0, 56(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    srw 0, 27, 4
-; LE-32BIT-NEXT:    stw 3, 64(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_5
-; LE-32BIT-NEXT:  # %bb.4:
-; LE-32BIT-NEXT:    ori 3, 24, 0
-; LE-32BIT-NEXT:    b .LBB11_6
-; LE-32BIT-NEXT:  .LBB11_5:
-; LE-32BIT-NEXT:    addi 3, 16, 0
-; LE-32BIT-NEXT:  .LBB11_6:
-; LE-32BIT-NEXT:    slw 16, 11, 15
-; LE-32BIT-NEXT:    or 0, 16, 0
-; LE-32BIT-NEXT:    subfic 16, 30, 128
-; LE-32BIT-NEXT:    stw 5, 52(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 5, 16, 32
-; LE-32BIT-NEXT:    stw 3, 60(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 3, 6, 16
-; LE-32BIT-NEXT:    srw 22, 10, 5
-; LE-32BIT-NEXT:    stw 29, 68(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 29, 3, 22
-; LE-32BIT-NEXT:    subfic 3, 30, 192
-; LE-32BIT-NEXT:    stw 8, 76(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    mr 8, 12
-; LE-32BIT-NEXT:    mr 23, 9
-; LE-32BIT-NEXT:    mr 9, 27
-; LE-32BIT-NEXT:    slw 22, 11, 16
-; LE-32BIT-NEXT:    srw 27, 27, 5
-; LE-32BIT-NEXT:    subfic 19, 3, 32
-; LE-32BIT-NEXT:    mr 12, 28
-; LE-32BIT-NEXT:    or 27, 22, 27
-; LE-32BIT-NEXT:    slw 22, 23, 15
-; LE-32BIT-NEXT:    srw 26, 8, 4
-; LE-32BIT-NEXT:    srw 19, 10, 19
-; LE-32BIT-NEXT:    slw 24, 6, 3
-; LE-32BIT-NEXT:    srw 4, 10, 4
-; LE-32BIT-NEXT:    slw 28, 6, 15
-; LE-32BIT-NEXT:    or 26, 22, 26
-; LE-32BIT-NEXT:    cmpwi 7, 1
-; LE-32BIT-NEXT:    sraw 22, 6, 7
-; LE-32BIT-NEXT:    or 24, 24, 19
-; LE-32BIT-NEXT:    srw 19, 11, 31
-; LE-32BIT-NEXT:    mr 7, 11
-; LE-32BIT-NEXT:    or 11, 28, 4
-; LE-32BIT-NEXT:    lwz 4, 80(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_7
-; LE-32BIT-NEXT:    b .LBB11_8
-; LE-32BIT-NEXT:  .LBB11_7:
-; LE-32BIT-NEXT:    addi 22, 14, 0
-; LE-32BIT-NEXT:  .LBB11_8:
-; LE-32BIT-NEXT:    cmplwi 1, 30, 64
-; LE-32BIT-NEXT:    cmplwi 30, 128
-; LE-32BIT-NEXT:    slw 3, 10, 3
-; LE-32BIT-NEXT:    or 19, 18, 19
-; LE-32BIT-NEXT:    cmpwi 5, 31, 1
-; LE-32BIT-NEXT:    sraw 18, 6, 31
-; LE-32BIT-NEXT:    crand 28, 0, 4
-; LE-32BIT-NEXT:    srawi 14, 6, 31
-; LE-32BIT-NEXT:    sraw 31, 6, 30
-; LE-32BIT-NEXT:    or 3, 21, 3
-; LE-32BIT-NEXT:    slw 21, 8, 15
-; LE-32BIT-NEXT:    bc 12, 20, .LBB11_10
-; LE-32BIT-NEXT:  # %bb.9:
-; LE-32BIT-NEXT:    ori 28, 18, 0
-; LE-32BIT-NEXT:    b .LBB11_11
-; LE-32BIT-NEXT:  .LBB11_10:
-; LE-32BIT-NEXT:    addi 28, 17, 0
-; LE-32BIT-NEXT:  .LBB11_11:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB11_13
-; LE-32BIT-NEXT:  # %bb.12:
-; LE-32BIT-NEXT:    ori 18, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_14
-; LE-32BIT-NEXT:  .LBB11_13:
-; LE-32BIT-NEXT:    addi 18, 31, 0
-; LE-32BIT-NEXT:  .LBB11_14:
-; LE-32BIT-NEXT:    or 21, 20, 21
-; LE-32BIT-NEXT:    subfic 20, 16, 64
-; LE-32BIT-NEXT:    stw 18, 0(4)
-; LE-32BIT-NEXT:    subfic 18, 20, 32
-; LE-32BIT-NEXT:    slw 18, 7, 18
-; LE-32BIT-NEXT:    srw 17, 9, 20
-; LE-32BIT-NEXT:    or 18, 17, 18
-; LE-32BIT-NEXT:    slw 17, 9, 25
-; LE-32BIT-NEXT:    mr 31, 8
-; LE-32BIT-NEXT:    stw 8, 40(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 8, 0, 17
-; LE-32BIT-NEXT:    slw 0, 10, 12
-; LE-32BIT-NEXT:    stw 8, 28(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    or 8, 29, 0
-; LE-32BIT-NEXT:    slw 0, 9, 12
-; LE-32BIT-NEXT:    or 12, 27, 0
-; LE-32BIT-NEXT:    stw 12, 32(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    slw 0, 31, 25
-; LE-32BIT-NEXT:    lwz 12, 48(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 0, 26, 0
-; LE-32BIT-NEXT:    mr 17, 10
-; LE-32BIT-NEXT:    slw 25, 10, 25
-; LE-32BIT-NEXT:    slw 26, 10, 12
-; LE-32BIT-NEXT:    or 26, 24, 26
-; LE-32BIT-NEXT:    slw 24, 10, 15
-; LE-32BIT-NEXT:    or 24, 19, 24
-; LE-32BIT-NEXT:    lwz 19, 56(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 11, 11, 25
-; LE-32BIT-NEXT:    lwz 10, 36(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 25, 7, 20
-; LE-32BIT-NEXT:    cmplwi 6, 19, 64
-; LE-32BIT-NEXT:    or 8, 8, 25
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_16
-; LE-32BIT-NEXT:  # %bb.15:
-; LE-32BIT-NEXT:    ori 27, 10, 0
-; LE-32BIT-NEXT:    b .LBB11_17
-; LE-32BIT-NEXT:  .LBB11_16:
-; LE-32BIT-NEXT:    addi 27, 3, 0
-; LE-32BIT-NEXT:  .LBB11_17:
-; LE-32BIT-NEXT:    lwz 10, 52(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 5, 7, 5
-; LE-32BIT-NEXT:    lwz 3, 44(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    or 5, 18, 5
-; LE-32BIT-NEXT:    srw 25, 10, 30
-; LE-32BIT-NEXT:    or 25, 25, 0
-; LE-32BIT-NEXT:    srw 0, 7, 19
-; LE-32BIT-NEXT:    or 26, 0, 26
-; LE-32BIT-NEXT:    srw 0, 7, 30
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_19
-; LE-32BIT-NEXT:  # %bb.18:
-; LE-32BIT-NEXT:    ori 29, 3, 0
-; LE-32BIT-NEXT:    b .LBB11_20
-; LE-32BIT-NEXT:  .LBB11_19:
-; LE-32BIT-NEXT:    addi 29, 21, 0
-; LE-32BIT-NEXT:  .LBB11_20:
-; LE-32BIT-NEXT:    mr 3, 7
-; LE-32BIT-NEXT:    or 11, 0, 11
-; LE-32BIT-NEXT:    bc 12, 28, .LBB11_22
-; LE-32BIT-NEXT:  # %bb.21:
-; LE-32BIT-NEXT:    ori 0, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_23
-; LE-32BIT-NEXT:  .LBB11_22:
-; LE-32BIT-NEXT:    addi 0, 28, 0
-; LE-32BIT-NEXT:  .LBB11_23:
-; LE-32BIT-NEXT:    lwz 7, 72(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mr 18, 4
-; LE-32BIT-NEXT:    stw 0, 4(4)
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_25
-; LE-32BIT-NEXT:  # %bb.24:
-; LE-32BIT-NEXT:    ori 24, 22, 0
-; LE-32BIT-NEXT:    b .LBB11_25
-; LE-32BIT-NEXT:  .LBB11_25:
-; LE-32BIT-NEXT:    cmplwi 5, 30, 0
-; LE-32BIT-NEXT:    lwz 4, 68(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    cmplwi 2, 19, 0
-; LE-32BIT-NEXT:    mr 31, 23
-; LE-32BIT-NEXT:    srw 30, 23, 30
-; LE-32BIT-NEXT:    slw 28, 9, 16
-; LE-32BIT-NEXT:    slw 23, 9, 15
-; LE-32BIT-NEXT:    sraw 21, 6, 7
-; LE-32BIT-NEXT:    bc 12, 10, .LBB11_27
-; LE-32BIT-NEXT:  # %bb.26:
-; LE-32BIT-NEXT:    ori 7, 27, 0
-; LE-32BIT-NEXT:    b .LBB11_28
-; LE-32BIT-NEXT:  .LBB11_27:
-; LE-32BIT-NEXT:    addi 7, 9, 0
-; LE-32BIT-NEXT:  .LBB11_28:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB11_30
-; LE-32BIT-NEXT:  # %bb.29:
-; LE-32BIT-NEXT:    ori 12, 24, 0
-; LE-32BIT-NEXT:    b .LBB11_31
-; LE-32BIT-NEXT:  .LBB11_30:
-; LE-32BIT-NEXT:    addi 12, 9, 0
-; LE-32BIT-NEXT:  .LBB11_31:
-; LE-32BIT-NEXT:    lwz 9, 64(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    srw 22, 31, 4
-; LE-32BIT-NEXT:    sraw 20, 6, 4
-; LE-32BIT-NEXT:    lwz 4, 28(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    cmplwi 7, 16, 64
-; LE-32BIT-NEXT:    cmplwi 3, 16, 0
-; LE-32BIT-NEXT:    slw 0, 17, 16
-; LE-32BIT-NEXT:    lwz 16, 76(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    li 15, 0
-; LE-32BIT-NEXT:    or 5, 0, 5
-; LE-32BIT-NEXT:    bc 12, 28, .LBB11_33
-; LE-32BIT-NEXT:  # %bb.32:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_34
-; LE-32BIT-NEXT:  .LBB11_33:
-; LE-32BIT-NEXT:    addi 0, 28, 0
-; LE-32BIT-NEXT:  .LBB11_34:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_36
-; LE-32BIT-NEXT:  # %bb.35:
-; LE-32BIT-NEXT:    ori 28, 22, 0
-; LE-32BIT-NEXT:    ori 25, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_37
-; LE-32BIT-NEXT:  .LBB11_36:
-; LE-32BIT-NEXT:    addi 28, 25, 0
-; LE-32BIT-NEXT:    addi 25, 9, 0
-; LE-32BIT-NEXT:  .LBB11_37:
-; LE-32BIT-NEXT:    lwz 9, 60(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 28, .LBB11_39
-; LE-32BIT-NEXT:  # %bb.38:
-; LE-32BIT-NEXT:    ori 8, 4, 0
-; LE-32BIT-NEXT:    b .LBB11_39
-; LE-32BIT-NEXT:  .LBB11_39:
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_41
-; LE-32BIT-NEXT:  # %bb.40:
-; LE-32BIT-NEXT:    ori 4, 20, 0
-; LE-32BIT-NEXT:    b .LBB11_42
-; LE-32BIT-NEXT:  .LBB11_41:
-; LE-32BIT-NEXT:    addi 4, 11, 0
-; LE-32BIT-NEXT:  .LBB11_42:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB11_43
-; LE-32BIT-NEXT:    b .LBB11_44
-; LE-32BIT-NEXT:  .LBB11_43:
-; LE-32BIT-NEXT:    addi 29, 16, 0
-; LE-32BIT-NEXT:  .LBB11_44:
-; LE-32BIT-NEXT:    sraw 19, 6, 19
-; LE-32BIT-NEXT:    bc 12, 22, .LBB11_45
-; LE-32BIT-NEXT:    b .LBB11_46
-; LE-32BIT-NEXT:  .LBB11_45:
-; LE-32BIT-NEXT:    addi 4, 3, 0
-; LE-32BIT-NEXT:  .LBB11_46:
-; LE-32BIT-NEXT:    or 29, 29, 0
-; LE-32BIT-NEXT:    bc 12, 4, .LBB11_48
-; LE-32BIT-NEXT:  # %bb.47:
-; LE-32BIT-NEXT:    ori 0, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_49
-; LE-32BIT-NEXT:  .LBB11_48:
-; LE-32BIT-NEXT:    addi 0, 30, 0
-; LE-32BIT-NEXT:  .LBB11_49:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB11_51
-; LE-32BIT-NEXT:  # %bb.50:
-; LE-32BIT-NEXT:    ori 6, 8, 0
-; LE-32BIT-NEXT:    b .LBB11_51
-; LE-32BIT-NEXT:  .LBB11_51:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_53
-; LE-32BIT-NEXT:  # %bb.52:
-; LE-32BIT-NEXT:    ori 4, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_53
-; LE-32BIT-NEXT:  .LBB11_53:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_55
-; LE-32BIT-NEXT:  # %bb.54:
-; LE-32BIT-NEXT:    ori 30, 14, 0
-; LE-32BIT-NEXT:    ori 26, 21, 0
-; LE-32BIT-NEXT:    b .LBB11_56
-; LE-32BIT-NEXT:  .LBB11_55:
-; LE-32BIT-NEXT:    addi 30, 19, 0
-; LE-32BIT-NEXT:  .LBB11_56:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB11_58
-; LE-32BIT-NEXT:  # %bb.57:
-; LE-32BIT-NEXT:    ori 5, 23, 0
-; LE-32BIT-NEXT:    b .LBB11_58
-; LE-32BIT-NEXT:  .LBB11_58:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB11_60
-; LE-32BIT-NEXT:  # %bb.59:
-; LE-32BIT-NEXT:    ori 8, 28, 0
-; LE-32BIT-NEXT:    b .LBB11_61
-; LE-32BIT-NEXT:  .LBB11_60:
-; LE-32BIT-NEXT:    addi 8, 10, 0
-; LE-32BIT-NEXT:  .LBB11_61:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_63
-; LE-32BIT-NEXT:  # %bb.62:
-; LE-32BIT-NEXT:    ori 12, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_63
-; LE-32BIT-NEXT:  .LBB11_63:
-; LE-32BIT-NEXT:    bc 12, 24, .LBB11_65
-; LE-32BIT-NEXT:  # %bb.64:
-; LE-32BIT-NEXT:    ori 24, 14, 0
-; LE-32BIT-NEXT:    b .LBB11_66
-; LE-32BIT-NEXT:  .LBB11_65:
-; LE-32BIT-NEXT:    addi 24, 9, 0
-; LE-32BIT-NEXT:  .LBB11_66:
-; LE-32BIT-NEXT:    lwz 9, 32(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    bc 12, 10, .LBB11_68
-; LE-32BIT-NEXT:  # %bb.67:
-; LE-32BIT-NEXT:    ori 28, 26, 0
-; LE-32BIT-NEXT:    b .LBB11_69
-; LE-32BIT-NEXT:  .LBB11_68:
-; LE-32BIT-NEXT:    addi 28, 3, 0
-; LE-32BIT-NEXT:  .LBB11_69:
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_71
-; LE-32BIT-NEXT:  # %bb.70:
-; LE-32BIT-NEXT:    ori 3, 7, 0
-; LE-32BIT-NEXT:    b .LBB11_72
-; LE-32BIT-NEXT:  .LBB11_71:
-; LE-32BIT-NEXT:    addi 3, 29, 0
-; LE-32BIT-NEXT:  .LBB11_72:
-; LE-32BIT-NEXT:    bc 12, 14, .LBB11_73
-; LE-32BIT-NEXT:    b .LBB11_74
-; LE-32BIT-NEXT:  .LBB11_73:
-; LE-32BIT-NEXT:    addi 5, 17, 0
-; LE-32BIT-NEXT:  .LBB11_74:
-; LE-32BIT-NEXT:    stw 4, 8(18)
-; LE-32BIT-NEXT:    or 4, 0, 6
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_76
-; LE-32BIT-NEXT:  # %bb.75:
-; LE-32BIT-NEXT:    ori 4, 30, 0
-; LE-32BIT-NEXT:    b .LBB11_76
-; LE-32BIT-NEXT:  .LBB11_76:
-; LE-32BIT-NEXT:    bc 12, 28, .LBB11_78
-; LE-32BIT-NEXT:  # %bb.77:
-; LE-32BIT-NEXT:    ori 27, 15, 0
-; LE-32BIT-NEXT:    b .LBB11_79
-; LE-32BIT-NEXT:  .LBB11_78:
-; LE-32BIT-NEXT:    addi 27, 9, 0
-; LE-32BIT-NEXT:  .LBB11_79:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB11_80
-; LE-32BIT-NEXT:    b .LBB11_81
-; LE-32BIT-NEXT:  .LBB11_80:
-; LE-32BIT-NEXT:    addi 3, 16, 0
-; LE-32BIT-NEXT:  .LBB11_81:
-; LE-32BIT-NEXT:    stw 12, 12(18)
-; LE-32BIT-NEXT:    bc 12, 22, .LBB11_82
-; LE-32BIT-NEXT:    b .LBB11_83
-; LE-32BIT-NEXT:  .LBB11_82:
-; LE-32BIT-NEXT:    addi 4, 31, 0
-; LE-32BIT-NEXT:  .LBB11_83:
-; LE-32BIT-NEXT:    or 7, 8, 27
-; LE-32BIT-NEXT:    stw 4, 16(18)
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_85
-; LE-32BIT-NEXT:  # %bb.84:
-; LE-32BIT-NEXT:    ori 6, 28, 0
-; LE-32BIT-NEXT:    b .LBB11_86
-; LE-32BIT-NEXT:  .LBB11_85:
-; LE-32BIT-NEXT:    addi 6, 7, 0
-; LE-32BIT-NEXT:  .LBB11_86:
-; LE-32BIT-NEXT:    lwz 4, 40(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    stw 3, 28(18)
-; LE-32BIT-NEXT:    or 3, 25, 5
-; LE-32BIT-NEXT:    bc 12, 0, .LBB11_88
-; LE-32BIT-NEXT:  # %bb.87:
-; LE-32BIT-NEXT:    ori 3, 24, 0
-; LE-32BIT-NEXT:    b .LBB11_88
-; LE-32BIT-NEXT:  .LBB11_88:
-; LE-32BIT-NEXT:    bc 12, 22, .LBB11_90
-; LE-32BIT-NEXT:  # %bb.89:
-; LE-32BIT-NEXT:    ori 5, 6, 0
-; LE-32BIT-NEXT:    b .LBB11_91
-; LE-32BIT-NEXT:  .LBB11_90:
-; LE-32BIT-NEXT:    addi 5, 10, 0
-; LE-32BIT-NEXT:    addi 3, 4, 0
-; LE-32BIT-NEXT:  .LBB11_91:
-; LE-32BIT-NEXT:    stw 5, 24(18)
-; LE-32BIT-NEXT:    stw 3, 20(18)
-; LE-32BIT-NEXT:    lwz 12, 84(1)
-; LE-32BIT-NEXT:    lwz 31, 156(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
-; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
-; LE-32BIT-NEXT:    lwz 30, 152(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 29, 148(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 28, 144(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 27, 140(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 26, 136(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 25, 132(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 24, 128(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 23, 124(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 22, 120(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 21, 116(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 20, 112(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 19, 108(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 18, 104(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 17, 100(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 16, 96(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 15, 92(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    lwz 14, 88(1) # 4-byte Folded Reload
-; LE-32BIT-NEXT:    addi 1, 1, 160
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 3, 80(1)
+; LE-32BIT-NEXT:    srawi 3, 7, 31
+; LE-32BIT-NEXT:    stw 7, 52(1)
+; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 27, 31
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 0, 76(1)
+; LE-32BIT-NEXT:    stw 12, 72(1)
+; LE-32BIT-NEXT:    stw 11, 68(1)
+; LE-32BIT-NEXT:    stw 10, 64(1)
+; LE-32BIT-NEXT:    stw 9, 60(1)
+; LE-32BIT-NEXT:    li 9, 7
+; LE-32BIT-NEXT:    stw 8, 56(1)
+; LE-32BIT-NEXT:    nand 9, 4, 9
+; LE-32BIT-NEXT:    stw 3, 48(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 29
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    subfic 30, 4, 32
+; LE-32BIT-NEXT:    stw 3, 40(1)
+; LE-32BIT-NEXT:    clrlwi 9, 9, 27
+; LE-32BIT-NEXT:    stw 3, 36(1)
+; LE-32BIT-NEXT:    stw 3, 32(1)
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    sub 3, 6, 7
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 8(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    slwi 29, 6, 1
+; LE-32BIT-NEXT:    lwz 10, 16(3)
+; LE-32BIT-NEXT:    srw 28, 7, 4
+; LE-32BIT-NEXT:    lwz 11, 20(3)
+; LE-32BIT-NEXT:    slwi 27, 8, 1
+; LE-32BIT-NEXT:    lwz 12, 24(3)
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    lwz 0, 0(3)
+; LE-32BIT-NEXT:    srw 6, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    srw 25, 12, 4
+; LE-32BIT-NEXT:    slw 12, 12, 30
+; LE-32BIT-NEXT:    slw 7, 7, 30
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    slw 10, 10, 30
+; LE-32BIT-NEXT:    slw 30, 0, 30
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    sraw 0, 0, 4
+; LE-32BIT-NEXT:    srw 4, 11, 4
+; LE-32BIT-NEXT:    or 3, 12, 3
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    or 3, 10, 4
+; LE-32BIT-NEXT:    slwi 11, 11, 1
+; LE-32BIT-NEXT:    stw 3, 20(5)
+; LE-32BIT-NEXT:    or 3, 7, 8
+; LE-32BIT-NEXT:    slw 29, 29, 9
+; LE-32BIT-NEXT:    slw 27, 27, 9
+; LE-32BIT-NEXT:    slw 9, 11, 9
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 30, 6
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    or 3, 25, 9
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 26, 27
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 28, 29
+; LE-32BIT-NEXT:    stw 0, 0(5)
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
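
For readers skimming the RV32I check lines in the next file: a rough C model
of the pattern those lines encode (spill to a stack slot twice the type
width, pad the other half, reload at a whole-byte offset, then shift by the
leftover bits). This is only an illustration under the assumption of a
little-endian target and a shift amount below 128; the helper name is made
up and this is not the C++ in LegalizeIntegerTypes.cpp.

  #include <stdint.h>
  #include <string.h>

  /* Illustration only, not the commit's implementation. */
  static void lshr128_via_stack(const uint8_t src[16], unsigned amt,
                                uint8_t dst[16]) {
    uint8_t slot[32];                              /* 2x-wide stack slot   */
    memcpy(slot, src, 16);                         /* spill the value      */
    memset(slot + 16, 0, 16);                      /* zero-pad other half  */
    const uint8_t *base = slot + ((amt / 8) & 15); /* whole-byte index     */
    unsigned rem = amt & 7;                        /* leftover bit count   */
    for (int i = 0; i < 16; ++i)                   /* reload, finish shift */
      dst[i] = (uint8_t)((base[i] >> rem) |
                         (rem ? (unsigned)base[i + 1] << (8 - rem) : 0));
  }

The ashr check lines in these tests differ only in padding with the sign
value instead of zero and in using an arithmetic shift for the most
significant word.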

diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index ddb8305f9e351..2884531585700 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -153,100 +153,122 @@ define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: lshr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw t5, 4(a1)
-; RV32I-NEXT:    lw t3, 8(a1)
-; RV32I-NEXT:    lw t2, 12(a1)
-; RV32I-NEXT:    neg a3, a2
-; RV32I-NEXT:    li t0, 64
-; RV32I-NEXT:    li a4, 32
-; RV32I-NEXT:    sub a7, a4, a2
-; RV32I-NEXT:    sll a5, t3, a3
-; RV32I-NEXT:    bltz a7, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t1, a5
-; RV32I-NEXT:    j .LBB6_3
-; RV32I-NEXT:  .LBB6_2:
-; RV32I-NEXT:    sll a3, t2, a3
-; RV32I-NEXT:    sub a4, t0, a2
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    srli a6, t3, 1
-; RV32I-NEXT:    srl a4, a6, a4
-; RV32I-NEXT:    or t1, a3, a4
-; RV32I-NEXT:  .LBB6_3:
-; RV32I-NEXT:    srl t4, t5, a2
-; RV32I-NEXT:    addi a6, a2, -32
-; RV32I-NEXT:    slti a3, a6, 0
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    srl a4, t2, a2
-; RV32I-NEXT:    addi t6, a2, -96
-; RV32I-NEXT:    bltu a2, t0, .LBB6_5
-; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    slti t1, t6, 0
-; RV32I-NEXT:    neg s0, t1
-; RV32I-NEXT:    and s0, s0, a4
-; RV32I-NEXT:    mv t1, t5
-; RV32I-NEXT:    bnez a2, .LBB6_6
-; RV32I-NEXT:    j .LBB6_7
-; RV32I-NEXT:  .LBB6_5:
-; RV32I-NEXT:    and s0, a3, t4
-; RV32I-NEXT:    or s0, s0, t1
-; RV32I-NEXT:    mv t1, t5
-; RV32I-NEXT:    beqz a2, .LBB6_7
-; RV32I-NEXT:  .LBB6_6:
-; RV32I-NEXT:    mv t1, s0
-; RV32I-NEXT:  .LBB6_7:
-; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    not s0, a2
-; RV32I-NEXT:    bgez a6, .LBB6_9
-; RV32I-NEXT:  # %bb.8:
-; RV32I-NEXT:    srl t4, a1, a2
-; RV32I-NEXT:    slli t5, t5, 1
-; RV32I-NEXT:    sll t5, t5, s0
-; RV32I-NEXT:    or t4, t4, t5
-; RV32I-NEXT:  .LBB6_9:
-; RV32I-NEXT:    srl t3, t3, a2
-; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    sll t2, t2, s0
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    mv t3, t2
-; RV32I-NEXT:    bgez t6, .LBB6_15
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    bltu a2, t0, .LBB6_16
-; RV32I-NEXT:  .LBB6_11:
-; RV32I-NEXT:    bnez a2, .LBB6_17
-; RV32I-NEXT:  .LBB6_12:
-; RV32I-NEXT:    bltz a6, .LBB6_14
-; RV32I-NEXT:  .LBB6_13:
-; RV32I-NEXT:    mv t2, a4
-; RV32I-NEXT:  .LBB6_14:
-; RV32I-NEXT:    sltiu a2, a2, 64
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    and a5, a2, t2
-; RV32I-NEXT:    and a3, a3, a4
-; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sb zero, 31(sp)
+; RV32I-NEXT:    sb zero, 30(sp)
+; RV32I-NEXT:    sb zero, 29(sp)
+; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb zero, 27(sp)
+; RV32I-NEXT:    sb zero, 26(sp)
+; RV32I-NEXT:    sb zero, 25(sp)
+; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    sb zero, 23(sp)
+; RV32I-NEXT:    sb zero, 22(sp)
+; RV32I-NEXT:    sb zero, 21(sp)
+; RV32I-NEXT:    sb zero, 20(sp)
+; RV32I-NEXT:    sb zero, 19(sp)
+; RV32I-NEXT:    sb zero, 18(sp)
+; RV32I-NEXT:    sb zero, 17(sp)
+; RV32I-NEXT:    sb zero, 16(sp)
+; RV32I-NEXT:    sb a1, 12(sp)
+; RV32I-NEXT:    sb a5, 8(sp)
+; RV32I-NEXT:    sb a4, 4(sp)
+; RV32I-NEXT:    sb a3, 0(sp)
+; RV32I-NEXT:    srli a6, a1, 24
+; RV32I-NEXT:    sb a6, 15(sp)
+; RV32I-NEXT:    srli a6, a1, 16
+; RV32I-NEXT:    sb a6, 14(sp)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 13(sp)
+; RV32I-NEXT:    srli a1, a5, 24
+; RV32I-NEXT:    sb a1, 11(sp)
+; RV32I-NEXT:    srli a1, a5, 16
+; RV32I-NEXT:    sb a1, 10(sp)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(sp)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 7(sp)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 6(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(sp)
+; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 3(sp)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    sb a1, 2(sp)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(sp)
+; RV32I-NEXT:    slli a1, a2, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a6, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    srl a3, a3, a2
+; RV32I-NEXT:    lbu a4, 5(a1)
+; RV32I-NEXT:    lbu a5, 4(a1)
+; RV32I-NEXT:    lbu a6, 6(a1)
+; RV32I-NEXT:    lbu a7, 7(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    slli a5, a4, 1
+; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    srl a4, a4, a2
+; RV32I-NEXT:    lbu a5, 9(a1)
+; RV32I-NEXT:    lbu a7, 8(a1)
+; RV32I-NEXT:    lbu t0, 10(a1)
+; RV32I-NEXT:    lbu t1, 11(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a5, t1, a5
+; RV32I-NEXT:    slli a7, a5, 1
+; RV32I-NEXT:    not t0, a2
+; RV32I-NEXT:    lbu t1, 13(a1)
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    or a4, a4, a7
+; RV32I-NEXT:    lbu a7, 12(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    lbu t0, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    srl a5, a5, a2
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    sw a5, 8(a0)
-; RV32I-NEXT:    sw a1, 0(a0)
-; RV32I-NEXT:    sw t1, 4(a0)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB6_15:
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:    bgeu a2, t0, .LBB6_11
-; RV32I-NEXT:  .LBB6_16:
-; RV32I-NEXT:    slti a7, a7, 0
-; RV32I-NEXT:    neg a7, a7
-; RV32I-NEXT:    and a5, a7, a5
-; RV32I-NEXT:    or t3, t4, a5
-; RV32I-NEXT:    beqz a2, .LBB6_12
-; RV32I-NEXT:  .LBB6_17:
-; RV32I-NEXT:    mv a1, t3
-; RV32I-NEXT:    bgez a6, .LBB6_13
-; RV32I-NEXT:    j .LBB6_14
 ;
 ; RV64I-LABEL: lshr128:
 ; RV64I:       # %bb.0:
@@ -274,110 +296,126 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: ashr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw t3, 8(a1)
-; RV32I-NEXT:    lw t2, 12(a1)
-; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    li a3, 64
-; RV32I-NEXT:    li a5, 32
-; RV32I-NEXT:    sub t1, a5, a2
-; RV32I-NEXT:    sll t0, t3, a4
-; RV32I-NEXT:    bltz t1, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a7, t0
-; RV32I-NEXT:    j .LBB7_3
-; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    sll a4, t2, a4
-; RV32I-NEXT:    sub a5, a3, a2
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    srli a6, t3, 1
-; RV32I-NEXT:    srl a5, a6, a5
-; RV32I-NEXT:    or a7, a4, a5
-; RV32I-NEXT:  .LBB7_3:
-; RV32I-NEXT:    lw t6, 4(a1)
-; RV32I-NEXT:    sra a4, t2, a2
-; RV32I-NEXT:    addi t4, a2, -96
-; RV32I-NEXT:    srai a5, t2, 31
-; RV32I-NEXT:    mv s0, a4
-; RV32I-NEXT:    bltz t4, .LBB7_5
-; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    mv s0, a5
-; RV32I-NEXT:  .LBB7_5:
-; RV32I-NEXT:    addi a6, a2, -32
-; RV32I-NEXT:    srl t5, t6, a2
-; RV32I-NEXT:    bgeu a2, a3, .LBB7_7
-; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    slti s0, a6, 0
-; RV32I-NEXT:    neg s0, s0
-; RV32I-NEXT:    and s0, s0, t5
-; RV32I-NEXT:    or s0, s0, a7
-; RV32I-NEXT:  .LBB7_7:
-; RV32I-NEXT:    mv a7, t6
-; RV32I-NEXT:    beqz a2, .LBB7_9
-; RV32I-NEXT:  # %bb.8:
-; RV32I-NEXT:    mv a7, s0
-; RV32I-NEXT:  .LBB7_9:
+; RV32I-NEXT:    lw a3, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    not s0, a2
-; RV32I-NEXT:    bgez a6, .LBB7_11
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    srl t5, a1, a2
-; RV32I-NEXT:    slli t6, t6, 1
-; RV32I-NEXT:    sll t6, t6, s0
-; RV32I-NEXT:    or t5, t5, t6
-; RV32I-NEXT:  .LBB7_11:
-; RV32I-NEXT:    srl t3, t3, a2
-; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    sll t2, t2, s0
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    mv t3, t2
-; RV32I-NEXT:    bgez t4, .LBB7_20
-; RV32I-NEXT:  # %bb.12:
-; RV32I-NEXT:    bltu a2, a3, .LBB7_21
-; RV32I-NEXT:  .LBB7_13:
-; RV32I-NEXT:    bnez a2, .LBB7_22
-; RV32I-NEXT:  .LBB7_14:
-; RV32I-NEXT:    bgez a6, .LBB7_23
-; RV32I-NEXT:  .LBB7_15:
-; RV32I-NEXT:    bgeu a2, a3, .LBB7_24
-; RV32I-NEXT:  .LBB7_16:
-; RV32I-NEXT:    bgez a6, .LBB7_25
-; RV32I-NEXT:  .LBB7_17:
-; RV32I-NEXT:    bltu a2, a3, .LBB7_19
-; RV32I-NEXT:  .LBB7_18:
-; RV32I-NEXT:    mv a4, a5
-; RV32I-NEXT:  .LBB7_19:
-; RV32I-NEXT:    sw a4, 12(a0)
-; RV32I-NEXT:    sw t2, 8(a0)
-; RV32I-NEXT:    sw a1, 0(a0)
-; RV32I-NEXT:    sw a7, 4(a0)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    sb a3, 12(sp)
+; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a5, 4(sp)
+; RV32I-NEXT:    sb a1, 0(sp)
+; RV32I-NEXT:    srai a6, a3, 31
+; RV32I-NEXT:    sb a6, 28(sp)
+; RV32I-NEXT:    sb a6, 24(sp)
+; RV32I-NEXT:    sb a6, 20(sp)
+; RV32I-NEXT:    sb a6, 16(sp)
+; RV32I-NEXT:    srli a7, a3, 24
+; RV32I-NEXT:    sb a7, 15(sp)
+; RV32I-NEXT:    srli a7, a3, 16
+; RV32I-NEXT:    sb a7, 14(sp)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(sp)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 11(sp)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 10(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 9(sp)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 7(sp)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 6(sp)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 5(sp)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(sp)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(sp)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(sp)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 31(sp)
+; RV32I-NEXT:    srli a3, a6, 16
+; RV32I-NEXT:    sb a3, 30(sp)
+; RV32I-NEXT:    srli a4, a6, 8
+; RV32I-NEXT:    sb a4, 29(sp)
+; RV32I-NEXT:    sb a1, 27(sp)
+; RV32I-NEXT:    sb a3, 26(sp)
+; RV32I-NEXT:    sb a4, 25(sp)
+; RV32I-NEXT:    sb a1, 23(sp)
+; RV32I-NEXT:    sb a3, 22(sp)
+; RV32I-NEXT:    sb a4, 21(sp)
+; RV32I-NEXT:    sb a1, 19(sp)
+; RV32I-NEXT:    sb a3, 18(sp)
+; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    slli a1, a2, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a6, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    srl a3, a3, a2
+; RV32I-NEXT:    lbu a4, 5(a1)
+; RV32I-NEXT:    lbu a5, 4(a1)
+; RV32I-NEXT:    lbu a6, 6(a1)
+; RV32I-NEXT:    lbu a7, 7(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    slli a5, a4, 1
+; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    srl a4, a4, a2
+; RV32I-NEXT:    lbu a5, 9(a1)
+; RV32I-NEXT:    lbu a7, 8(a1)
+; RV32I-NEXT:    lbu t0, 10(a1)
+; RV32I-NEXT:    lbu t1, 11(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a5, t1, a5
+; RV32I-NEXT:    slli a7, a5, 1
+; RV32I-NEXT:    not t0, a2
+; RV32I-NEXT:    lbu t1, 13(a1)
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    or a4, a4, a7
+; RV32I-NEXT:    lbu a7, 12(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    lbu t0, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    srl a5, a5, a2
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    sw a1, 12(a0)
+; RV32I-NEXT:    sw a5, 8(a0)
+; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB7_20:
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:    bgeu a2, a3, .LBB7_13
-; RV32I-NEXT:  .LBB7_21:
-; RV32I-NEXT:    slti t1, t1, 0
-; RV32I-NEXT:    neg t1, t1
-; RV32I-NEXT:    and t0, t1, t0
-; RV32I-NEXT:    or t3, t5, t0
-; RV32I-NEXT:    beqz a2, .LBB7_14
-; RV32I-NEXT:  .LBB7_22:
-; RV32I-NEXT:    mv a1, t3
-; RV32I-NEXT:    bltz a6, .LBB7_15
-; RV32I-NEXT:  .LBB7_23:
-; RV32I-NEXT:    mv t2, a4
-; RV32I-NEXT:    bltu a2, a3, .LBB7_16
-; RV32I-NEXT:  .LBB7_24:
-; RV32I-NEXT:    mv t2, a5
-; RV32I-NEXT:    bltz a6, .LBB7_17
-; RV32I-NEXT:  .LBB7_25:
-; RV32I-NEXT:    mv a4, a5
-; RV32I-NEXT:    bgeu a2, a3, .LBB7_18
-; RV32I-NEXT:    j .LBB7_19
 ;
 ; RV64I-LABEL: ashr128:
 ; RV64I:       # %bb.0:
@@ -404,99 +442,122 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: shl128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw t5, 8(a1)
-; RV32I-NEXT:    lw t3, 4(a1)
-; RV32I-NEXT:    lw t2, 0(a1)
-; RV32I-NEXT:    neg a3, a2
-; RV32I-NEXT:    li a7, 64
-; RV32I-NEXT:    li a4, 32
-; RV32I-NEXT:    sub a6, a4, a2
-; RV32I-NEXT:    srl a5, t3, a3
-; RV32I-NEXT:    bltz a6, .LBB8_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t1, a5
-; RV32I-NEXT:    j .LBB8_3
-; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    srl a3, t2, a3
-; RV32I-NEXT:    sub a4, a7, a2
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    slli t0, t3, 1
-; RV32I-NEXT:    sll a4, t0, a4
-; RV32I-NEXT:    or t1, a3, a4
-; RV32I-NEXT:  .LBB8_3:
-; RV32I-NEXT:    sll t4, t5, a2
-; RV32I-NEXT:    addi a3, a2, -32
-; RV32I-NEXT:    slti a4, a3, 0
-; RV32I-NEXT:    neg t0, a4
-; RV32I-NEXT:    sll a4, t2, a2
-; RV32I-NEXT:    addi t6, a2, -96
-; RV32I-NEXT:    bltu a2, a7, .LBB8_5
-; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    slti t1, t6, 0
-; RV32I-NEXT:    neg s0, t1
-; RV32I-NEXT:    and s0, s0, a4
-; RV32I-NEXT:    mv t1, t5
-; RV32I-NEXT:    bnez a2, .LBB8_6
-; RV32I-NEXT:    j .LBB8_7
-; RV32I-NEXT:  .LBB8_5:
-; RV32I-NEXT:    and s0, t0, t4
-; RV32I-NEXT:    or s0, s0, t1
-; RV32I-NEXT:    mv t1, t5
-; RV32I-NEXT:    beqz a2, .LBB8_7
-; RV32I-NEXT:  .LBB8_6:
-; RV32I-NEXT:    mv t1, s0
-; RV32I-NEXT:  .LBB8_7:
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    not s0, a2
-; RV32I-NEXT:    bgez a3, .LBB8_9
-; RV32I-NEXT:  # %bb.8:
-; RV32I-NEXT:    sll t4, a1, a2
-; RV32I-NEXT:    srli t5, t5, 1
-; RV32I-NEXT:    srl t5, t5, s0
-; RV32I-NEXT:    or t4, t4, t5
-; RV32I-NEXT:  .LBB8_9:
-; RV32I-NEXT:    sll t3, t3, a2
-; RV32I-NEXT:    srli t2, t2, 1
-; RV32I-NEXT:    srl t2, t2, s0
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    mv t3, t2
-; RV32I-NEXT:    bgez t6, .LBB8_16
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    bltu a2, a7, .LBB8_17
-; RV32I-NEXT:  .LBB8_11:
-; RV32I-NEXT:    beqz a2, .LBB8_13
-; RV32I-NEXT:  .LBB8_12:
-; RV32I-NEXT:    mv a1, t3
-; RV32I-NEXT:  .LBB8_13:
-; RV32I-NEXT:    and a6, t0, a4
-; RV32I-NEXT:    sltiu a2, a2, 64
-; RV32I-NEXT:    neg a5, a2
-; RV32I-NEXT:    and a2, a5, a6
-; RV32I-NEXT:    bltz a3, .LBB8_15
-; RV32I-NEXT:  # %bb.14:
-; RV32I-NEXT:    mv t2, a4
-; RV32I-NEXT:  .LBB8_15:
-; RV32I-NEXT:    and a3, a5, t2
-; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sb zero, 15(sp)
+; RV32I-NEXT:    sb zero, 14(sp)
+; RV32I-NEXT:    sb zero, 13(sp)
+; RV32I-NEXT:    sb zero, 12(sp)
+; RV32I-NEXT:    sb zero, 11(sp)
+; RV32I-NEXT:    sb zero, 10(sp)
+; RV32I-NEXT:    sb zero, 9(sp)
+; RV32I-NEXT:    sb zero, 8(sp)
+; RV32I-NEXT:    sb zero, 7(sp)
+; RV32I-NEXT:    sb zero, 6(sp)
+; RV32I-NEXT:    sb zero, 5(sp)
+; RV32I-NEXT:    sb zero, 4(sp)
+; RV32I-NEXT:    sb zero, 3(sp)
+; RV32I-NEXT:    sb zero, 2(sp)
+; RV32I-NEXT:    sb zero, 1(sp)
+; RV32I-NEXT:    sb zero, 0(sp)
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    sb a5, 24(sp)
+; RV32I-NEXT:    sb a4, 20(sp)
+; RV32I-NEXT:    sb a3, 16(sp)
+; RV32I-NEXT:    srli a6, a1, 24
+; RV32I-NEXT:    sb a6, 31(sp)
+; RV32I-NEXT:    srli a6, a1, 16
+; RV32I-NEXT:    sb a6, 30(sp)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    srli a1, a5, 24
+; RV32I-NEXT:    sb a1, 27(sp)
+; RV32I-NEXT:    srli a1, a5, 16
+; RV32I-NEXT:    sb a1, 26(sp)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 25(sp)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 23(sp)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 22(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 21(sp)
+; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 19(sp)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    sb a1, 18(sp)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 17(sp)
+; RV32I-NEXT:    slli a1, a2, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    addi a3, sp, 16
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lbu a1, 5(a3)
+; RV32I-NEXT:    lbu a4, 4(a3)
+; RV32I-NEXT:    lbu a5, 6(a3)
+; RV32I-NEXT:    lbu a6, 7(a3)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    sll a4, a1, a2
+; RV32I-NEXT:    lbu a5, 1(a3)
+; RV32I-NEXT:    lbu a6, 0(a3)
+; RV32I-NEXT:    lbu a7, 2(a3)
+; RV32I-NEXT:    lbu t0, 3(a3)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    srli a6, a5, 1
+; RV32I-NEXT:    xori a7, a2, 31
+; RV32I-NEXT:    srl a6, a6, a7
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    lbu a6, 9(a3)
+; RV32I-NEXT:    lbu t0, 8(a3)
+; RV32I-NEXT:    lbu t1, 10(a3)
+; RV32I-NEXT:    lbu t2, 11(a3)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    sll t0, a6, a2
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    not t1, a2
+; RV32I-NEXT:    srl a1, a1, t1
+; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    lbu t0, 13(a3)
+; RV32I-NEXT:    lbu t1, 12(a3)
+; RV32I-NEXT:    lbu t2, 14(a3)
+; RV32I-NEXT:    lbu a3, 15(a3)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or t0, t2, t0
+; RV32I-NEXT:    or a3, a3, t0
+; RV32I-NEXT:    sll a3, a3, a2
+; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    srl a6, a6, a7
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    sll a2, a5, a2
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw t1, 8(a0)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB8_16:
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:    bgeu a2, a7, .LBB8_11
-; RV32I-NEXT:  .LBB8_17:
-; RV32I-NEXT:    slti a6, a6, 0
-; RV32I-NEXT:    neg a6, a6
-; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    or t3, t4, a5
-; RV32I-NEXT:    bnez a2, .LBB8_12
-; RV32I-NEXT:    j .LBB8_13
 ;
 ; RV64I-LABEL: shl128:
 ; RV64I:       # %bb.0:

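The by-byte-multiple tests in the next file take the offset in whole bytes
(note the "andi a1, a1, 15" in the new RV32I sequence), so the reload needs
no residual bit shift. A comparable sketch, with the same caveats as above
(illustration only, little-endian, hypothetical helper name):

  #include <stdint.h>
  #include <string.h>

  /* Illustration only, not the commit's implementation. */
  static void lshr_16bytes_model(const uint8_t src[16], unsigned byteOff,
                                 uint8_t dst[16]) {
    uint8_t slot[32];
    memcpy(slot, src, 16);                  /* spill the value            */
    memset(slot + 16, 0, 16);               /* zero-pad the upper half    */
    memcpy(dst, slot + (byteOff & 15), 16); /* byte-indexed reload only   */
  }
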
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index a10a5ed010cac..3c8bf9cf01c8d 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -726,168 +726,98 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 7(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sb zero, 35(sp)
+; RV32I-NEXT:    sb zero, 34(sp)
+; RV32I-NEXT:    sb zero, 33(sp)
+; RV32I-NEXT:    sb zero, 32(sp)
+; RV32I-NEXT:    sb zero, 31(sp)
+; RV32I-NEXT:    sb zero, 30(sp)
+; RV32I-NEXT:    sb zero, 29(sp)
+; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb zero, 27(sp)
+; RV32I-NEXT:    sb zero, 26(sp)
+; RV32I-NEXT:    sb zero, 25(sp)
+; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    sb zero, 23(sp)
+; RV32I-NEXT:    sb zero, 22(sp)
+; RV32I-NEXT:    sb zero, 21(sp)
+; RV32I-NEXT:    sb zero, 20(sp)
+; RV32I-NEXT:    sb a0, 19(sp)
+; RV32I-NEXT:    sb s2, 18(sp)
+; RV32I-NEXT:    sb s1, 17(sp)
+; RV32I-NEXT:    sb s0, 16(sp)
+; RV32I-NEXT:    sb t6, 15(sp)
+; RV32I-NEXT:    sb t5, 14(sp)
+; RV32I-NEXT:    sb t4, 13(sp)
+; RV32I-NEXT:    sb t3, 12(sp)
+; RV32I-NEXT:    sb t2, 11(sp)
+; RV32I-NEXT:    sb t1, 10(sp)
+; RV32I-NEXT:    sb t0, 9(sp)
+; RV32I-NEXT:    sb a7, 8(sp)
+; RV32I-NEXT:    sb a6, 7(sp)
+; RV32I-NEXT:    sb a5, 6(sp)
+; RV32I-NEXT:    sb a4, 5(sp)
+; RV32I-NEXT:    sb a3, 4(sp)
+; RV32I-NEXT:    andi a1, a1, 15
+; RV32I-NEXT:    addi a0, sp, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 7(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 0(a0)
 ; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a7, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or t2, t0, a6
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t0, 15(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a6, t0, a4
-; RV32I-NEXT:    lbu a4, 9(a0)
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu a0, 11(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t0, a4
-; RV32I-NEXT:    or t0, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    slli a1, a0, 3
-; RV32I-NEXT:    srl a0, t0, a1
-; RV32I-NEXT:    slli a4, a6, 1
-; RV32I-NEXT:    not t4, a1
-; RV32I-NEXT:    sll a4, a4, t4
-; RV32I-NEXT:    or a4, a0, a4
-; RV32I-NEXT:    addi t1, a1, -96
-; RV32I-NEXT:    srl a5, a6, a1
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:    bltz t1, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t3, a5
-; RV32I-NEXT:  .LBB6_2:
-; RV32I-NEXT:    or a0, t2, a7
-; RV32I-NEXT:    addi a7, a1, -32
-; RV32I-NEXT:    srl t2, a3, a1
-; RV32I-NEXT:    bltz a7, .LBB6_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s1, t2
-; RV32I-NEXT:    j .LBB6_5
-; RV32I-NEXT:  .LBB6_4:
-; RV32I-NEXT:    srl t5, a0, a1
-; RV32I-NEXT:    slli t6, a3, 1
-; RV32I-NEXT:    sll t4, t6, t4
-; RV32I-NEXT:    or s1, t5, t4
-; RV32I-NEXT:  .LBB6_5:
-; RV32I-NEXT:    neg t6, a1
-; RV32I-NEXT:    sll t4, t0, t6
-; RV32I-NEXT:    li s0, 32
-; RV32I-NEXT:    li t5, 64
-; RV32I-NEXT:    sub s0, s0, a1
-; RV32I-NEXT:    bltu a1, t5, .LBB6_11
-; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    bnez a1, .LBB6_12
-; RV32I-NEXT:  .LBB6_7:
-; RV32I-NEXT:    bgez s0, .LBB6_9
-; RV32I-NEXT:  .LBB6_8:
-; RV32I-NEXT:    sll a6, a6, t6
-; RV32I-NEXT:    srli t0, t0, 1
-; RV32I-NEXT:    sub t3, t5, a1
-; RV32I-NEXT:    not t3, t3
-; RV32I-NEXT:    srl t0, t0, t3
-; RV32I-NEXT:    or t4, a6, t0
-; RV32I-NEXT:  .LBB6_9:
-; RV32I-NEXT:    slti a6, a7, 0
-; RV32I-NEXT:    neg a6, a6
-; RV32I-NEXT:    bltu a1, t5, .LBB6_13
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    slti t0, t1, 0
-; RV32I-NEXT:    neg t0, t0
-; RV32I-NEXT:    and t0, t0, a5
-; RV32I-NEXT:    bnez a1, .LBB6_14
-; RV32I-NEXT:    j .LBB6_15
-; RV32I-NEXT:  .LBB6_11:
-; RV32I-NEXT:    slti t3, s0, 0
-; RV32I-NEXT:    neg t3, t3
-; RV32I-NEXT:    and t3, t3, t4
-; RV32I-NEXT:    or t3, s1, t3
-; RV32I-NEXT:    beqz a1, .LBB6_7
-; RV32I-NEXT:  .LBB6_12:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bltz s0, .LBB6_8
-; RV32I-NEXT:    j .LBB6_9
-; RV32I-NEXT:  .LBB6_13:
-; RV32I-NEXT:    and t0, a6, t2
-; RV32I-NEXT:    or t0, t0, t4
-; RV32I-NEXT:    beqz a1, .LBB6_15
-; RV32I-NEXT:  .LBB6_14:
-; RV32I-NEXT:    mv a3, t0
-; RV32I-NEXT:  .LBB6_15:
-; RV32I-NEXT:    bltz a7, .LBB6_17
-; RV32I-NEXT:  # %bb.16:
-; RV32I-NEXT:    mv a4, a5
-; RV32I-NEXT:  .LBB6_17:
-; RV32I-NEXT:    sltiu a1, a1, 64
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    and a4, a1, a4
-; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(a2)
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 15(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 8(a0)
+; RV32I-NEXT:    lbu a0, 9(a0)
+; RV32I-NEXT:    sb t6, 10(a2)
+; RV32I-NEXT:    sb s0, 11(a2)
+; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    sb t5, 14(a2)
+; RV32I-NEXT:    sb t4, 15(a2)
+; RV32I-NEXT:    sb t3, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    sb a6, 1(a2)
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1016,168 +946,98 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 9(a0)
-; RV32I-NEXT:    lbu a4, 8(a0)
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu a6, 11(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t0, 15(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a7, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or t2, t0, a6
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sb zero, 19(sp)
+; RV32I-NEXT:    sb zero, 18(sp)
+; RV32I-NEXT:    sb zero, 17(sp)
+; RV32I-NEXT:    sb zero, 16(sp)
+; RV32I-NEXT:    sb zero, 15(sp)
+; RV32I-NEXT:    sb zero, 14(sp)
+; RV32I-NEXT:    sb zero, 13(sp)
+; RV32I-NEXT:    sb zero, 12(sp)
+; RV32I-NEXT:    sb zero, 11(sp)
+; RV32I-NEXT:    sb zero, 10(sp)
+; RV32I-NEXT:    sb zero, 9(sp)
+; RV32I-NEXT:    sb zero, 8(sp)
+; RV32I-NEXT:    sb zero, 7(sp)
+; RV32I-NEXT:    sb zero, 6(sp)
+; RV32I-NEXT:    sb zero, 5(sp)
+; RV32I-NEXT:    sb zero, 4(sp)
+; RV32I-NEXT:    sb a0, 35(sp)
+; RV32I-NEXT:    sb s2, 34(sp)
+; RV32I-NEXT:    sb s1, 33(sp)
+; RV32I-NEXT:    sb s0, 32(sp)
+; RV32I-NEXT:    sb t6, 31(sp)
+; RV32I-NEXT:    sb t5, 30(sp)
+; RV32I-NEXT:    sb t4, 29(sp)
+; RV32I-NEXT:    sb t3, 28(sp)
+; RV32I-NEXT:    sb t2, 27(sp)
+; RV32I-NEXT:    sb t1, 26(sp)
+; RV32I-NEXT:    sb t0, 25(sp)
+; RV32I-NEXT:    sb a7, 24(sp)
+; RV32I-NEXT:    sb a6, 23(sp)
+; RV32I-NEXT:    sb a5, 22(sp)
+; RV32I-NEXT:    sb a4, 21(sp)
+; RV32I-NEXT:    sb a3, 20(sp)
+; RV32I-NEXT:    andi a1, a1, 15
+; RV32I-NEXT:    addi a0, sp, 20
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 7(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 0(a0)
 ; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a6, t0, a4
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu t0, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t0, a4
-; RV32I-NEXT:    or t0, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    slli a1, a0, 3
-; RV32I-NEXT:    sll a0, t0, a1
-; RV32I-NEXT:    srli a4, a6, 1
-; RV32I-NEXT:    not t4, a1
-; RV32I-NEXT:    srl a4, a4, t4
-; RV32I-NEXT:    or a4, a0, a4
-; RV32I-NEXT:    addi t1, a1, -96
-; RV32I-NEXT:    sll a5, a6, a1
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:    bltz t1, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t3, a5
-; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    or a0, t2, a7
-; RV32I-NEXT:    addi a7, a1, -32
-; RV32I-NEXT:    sll t2, a3, a1
-; RV32I-NEXT:    bltz a7, .LBB7_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s1, t2
-; RV32I-NEXT:    j .LBB7_5
-; RV32I-NEXT:  .LBB7_4:
-; RV32I-NEXT:    sll t5, a0, a1
-; RV32I-NEXT:    srli t6, a3, 1
-; RV32I-NEXT:    srl t4, t6, t4
-; RV32I-NEXT:    or s1, t5, t4
-; RV32I-NEXT:  .LBB7_5:
-; RV32I-NEXT:    neg t6, a1
-; RV32I-NEXT:    srl t4, t0, t6
-; RV32I-NEXT:    li s0, 32
-; RV32I-NEXT:    li t5, 64
-; RV32I-NEXT:    sub s0, s0, a1
-; RV32I-NEXT:    bltu a1, t5, .LBB7_11
-; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    bnez a1, .LBB7_12
-; RV32I-NEXT:  .LBB7_7:
-; RV32I-NEXT:    bgez s0, .LBB7_9
-; RV32I-NEXT:  .LBB7_8:
-; RV32I-NEXT:    srl a6, a6, t6
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    sub t3, t5, a1
-; RV32I-NEXT:    not t3, t3
-; RV32I-NEXT:    sll t0, t0, t3
-; RV32I-NEXT:    or t4, a6, t0
-; RV32I-NEXT:  .LBB7_9:
-; RV32I-NEXT:    slti a6, a7, 0
-; RV32I-NEXT:    neg a6, a6
-; RV32I-NEXT:    bltu a1, t5, .LBB7_13
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    slti t0, t1, 0
-; RV32I-NEXT:    neg t0, t0
-; RV32I-NEXT:    and t0, t0, a5
-; RV32I-NEXT:    bnez a1, .LBB7_14
-; RV32I-NEXT:    j .LBB7_15
-; RV32I-NEXT:  .LBB7_11:
-; RV32I-NEXT:    slti t3, s0, 0
-; RV32I-NEXT:    neg t3, t3
-; RV32I-NEXT:    and t3, t3, t4
-; RV32I-NEXT:    or t3, s1, t3
-; RV32I-NEXT:    beqz a1, .LBB7_7
-; RV32I-NEXT:  .LBB7_12:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bltz s0, .LBB7_8
-; RV32I-NEXT:    j .LBB7_9
-; RV32I-NEXT:  .LBB7_13:
-; RV32I-NEXT:    and t0, a6, t2
-; RV32I-NEXT:    or t0, t0, t4
-; RV32I-NEXT:    beqz a1, .LBB7_15
-; RV32I-NEXT:  .LBB7_14:
-; RV32I-NEXT:    mv a3, t0
-; RV32I-NEXT:  .LBB7_15:
-; RV32I-NEXT:    bltz a7, .LBB7_17
-; RV32I-NEXT:  # %bb.16:
-; RV32I-NEXT:    mv a4, a5
-; RV32I-NEXT:  .LBB7_17:
-; RV32I-NEXT:    sltiu a1, a1, 64
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    and a4, a1, a4
-; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
-; RV32I-NEXT:    srli a5, a1, 16
-; RV32I-NEXT:    sb a5, 2(a2)
-; RV32I-NEXT:    srli a5, a1, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 13(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 15(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 8(a0)
+; RV32I-NEXT:    lbu a0, 9(a0)
+; RV32I-NEXT:    sb t6, 10(a2)
+; RV32I-NEXT:    sb s0, 11(a2)
+; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    sb t5, 14(a2)
+; RV32I-NEXT:    sb t4, 15(a2)
+; RV32I-NEXT:    sb t3, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    sb a6, 1(a2)
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    sb a3, 4(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1305,186 +1165,106 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 7(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 15(a0)
+; RV32I-NEXT:    slli a4, a3, 24
 ; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a7, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or t0, t0, a6
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a5, a6, 16
-; RV32I-NEXT:    slli a6, t1, 24
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    or t1, a6, a4
-; RV32I-NEXT:    lbu a4, 9(a0)
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu t2, 10(a0)
-; RV32I-NEXT:    lbu a0, 11(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or t2, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    slli a5, a0, 3
-; RV32I-NEXT:    srl a0, t2, a5
-; RV32I-NEXT:    slli a1, t1, 1
-; RV32I-NEXT:    not t5, a5
-; RV32I-NEXT:    sll a1, a1, t5
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    addi t3, a5, -96
-; RV32I-NEXT:    sra a4, t1, a5
-; RV32I-NEXT:    mv t6, a0
-; RV32I-NEXT:    bltz t3, .LBB8_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t6, a4
-; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    or a1, t0, a7
-; RV32I-NEXT:    addi a7, a5, -32
-; RV32I-NEXT:    srl t4, a3, a5
-; RV32I-NEXT:    bltz a7, .LBB8_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s2, t4
-; RV32I-NEXT:    j .LBB8_5
-; RV32I-NEXT:  .LBB8_4:
-; RV32I-NEXT:    srl t0, a1, a5
-; RV32I-NEXT:    slli s0, a3, 1
-; RV32I-NEXT:    sll t5, s0, t5
-; RV32I-NEXT:    or s2, t0, t5
-; RV32I-NEXT:  .LBB8_5:
-; RV32I-NEXT:    neg s0, a5
-; RV32I-NEXT:    sll t5, t2, s0
-; RV32I-NEXT:    li s1, 32
-; RV32I-NEXT:    li t0, 64
-; RV32I-NEXT:    sub s1, s1, a5
-; RV32I-NEXT:    bltu a5, t0, .LBB8_18
-; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    bnez a5, .LBB8_19
-; RV32I-NEXT:  .LBB8_7:
-; RV32I-NEXT:    bgez s1, .LBB8_9
-; RV32I-NEXT:  .LBB8_8:
-; RV32I-NEXT:    sll t1, t1, s0
-; RV32I-NEXT:    srli t2, t2, 1
-; RV32I-NEXT:    sub t5, t0, a5
-; RV32I-NEXT:    not t5, t5
-; RV32I-NEXT:    srl t2, t2, t5
-; RV32I-NEXT:    or t5, t1, t2
-; RV32I-NEXT:  .LBB8_9:
-; RV32I-NEXT:    srai a6, a6, 31
-; RV32I-NEXT:    mv t1, a4
-; RV32I-NEXT:    bgez t3, .LBB8_20
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    bltu a5, t0, .LBB8_21
-; RV32I-NEXT:  .LBB8_11:
-; RV32I-NEXT:    bnez a5, .LBB8_22
-; RV32I-NEXT:  .LBB8_12:
-; RV32I-NEXT:    bgez a7, .LBB8_23
-; RV32I-NEXT:  .LBB8_13:
-; RV32I-NEXT:    bgeu a5, t0, .LBB8_24
-; RV32I-NEXT:  .LBB8_14:
-; RV32I-NEXT:    bgez a7, .LBB8_25
-; RV32I-NEXT:  .LBB8_15:
-; RV32I-NEXT:    bltu a5, t0, .LBB8_17
-; RV32I-NEXT:  .LBB8_16:
-; RV32I-NEXT:    mv a4, a6
-; RV32I-NEXT:  .LBB8_17:
-; RV32I-NEXT:    sb a4, 12(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    sb a5, 15(a2)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 14(a0)
+; RV32I-NEXT:    lbu a0, 13(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb s3, 14(sp)
+; RV32I-NEXT:    sb a0, 13(sp)
+; RV32I-NEXT:    sb s2, 12(sp)
+; RV32I-NEXT:    sb s1, 11(sp)
+; RV32I-NEXT:    sb s0, 10(sp)
+; RV32I-NEXT:    sb t6, 9(sp)
+; RV32I-NEXT:    sb t5, 8(sp)
+; RV32I-NEXT:    sb t4, 7(sp)
+; RV32I-NEXT:    sb t3, 6(sp)
+; RV32I-NEXT:    sb t2, 5(sp)
+; RV32I-NEXT:    sb t1, 4(sp)
+; RV32I-NEXT:    sb t0, 3(sp)
+; RV32I-NEXT:    sb a7, 2(sp)
+; RV32I-NEXT:    sb a6, 1(sp)
+; RV32I-NEXT:    sb a5, 0(sp)
+; RV32I-NEXT:    srai a4, a4, 31
+; RV32I-NEXT:    sb a4, 28(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    sb a4, 20(sp)
+; RV32I-NEXT:    sb a4, 16(sp)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 31(sp)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 30(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 13(a2)
-; RV32I-NEXT:    sb a0, 8(a2)
-; RV32I-NEXT:    srli a4, a0, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a4, 29(sp)
+; RV32I-NEXT:    sb a0, 27(sp)
+; RV32I-NEXT:    sb a3, 26(sp)
+; RV32I-NEXT:    sb a4, 25(sp)
+; RV32I-NEXT:    sb a0, 23(sp)
+; RV32I-NEXT:    sb a3, 22(sp)
+; RV32I-NEXT:    sb a4, 21(sp)
+; RV32I-NEXT:    sb a0, 19(sp)
+; RV32I-NEXT:    sb a3, 18(sp)
+; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    andi a1, a1, 15
+; RV32I-NEXT:    mv a0, sp
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 7(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 0(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 15(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 8(a0)
+; RV32I-NEXT:    lbu a0, 9(a0)
+; RV32I-NEXT:    sb t6, 10(a2)
+; RV32I-NEXT:    sb s0, 11(a2)
+; RV32I-NEXT:    sb s1, 8(a2)
 ; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    srli a0, a1, 16
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    srli a0, a1, 24
-; RV32I-NEXT:    sb a0, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb t5, 14(a2)
+; RV32I-NEXT:    sb t4, 15(a2)
+; RV32I-NEXT:    sb t3, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    sb a6, 1(a2)
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB8_18:
-; RV32I-NEXT:    slti t6, s1, 0
-; RV32I-NEXT:    neg t6, t6
-; RV32I-NEXT:    and t6, t6, t5
-; RV32I-NEXT:    or t6, s2, t6
-; RV32I-NEXT:    beqz a5, .LBB8_7
-; RV32I-NEXT:  .LBB8_19:
-; RV32I-NEXT:    mv a1, t6
-; RV32I-NEXT:    bltz s1, .LBB8_8
-; RV32I-NEXT:    j .LBB8_9
-; RV32I-NEXT:  .LBB8_20:
-; RV32I-NEXT:    mv t1, a6
-; RV32I-NEXT:    bgeu a5, t0, .LBB8_11
-; RV32I-NEXT:  .LBB8_21:
-; RV32I-NEXT:    slti t1, a7, 0
-; RV32I-NEXT:    neg t1, t1
-; RV32I-NEXT:    and t1, t1, t4
-; RV32I-NEXT:    or t1, t1, t5
-; RV32I-NEXT:    beqz a5, .LBB8_12
-; RV32I-NEXT:  .LBB8_22:
-; RV32I-NEXT:    mv a3, t1
-; RV32I-NEXT:    bltz a7, .LBB8_13
-; RV32I-NEXT:  .LBB8_23:
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    bltu a5, t0, .LBB8_14
-; RV32I-NEXT:  .LBB8_24:
-; RV32I-NEXT:    mv a0, a6
-; RV32I-NEXT:    bltz a7, .LBB8_15
-; RV32I-NEXT:  .LBB8_25:
-; RV32I-NEXT:    mv a4, a6
-; RV32I-NEXT:    bgeu a5, t0, .LBB8_16
-; RV32I-NEXT:    j .LBB8_17
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -1496,824 +1276,438 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
-; RV64I-NEXT:    lbu a5, 10(a0)
-; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a5, a7, a4
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or t0, t0, a4
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    lbu a4, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
+; RV64I-NEXT:    addi sp, sp, -224
+; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 3(a0)
+; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 4(a0)
+; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    lbu ra, 24(a0)
+; RV64I-NEXT:    lbu t0, 25(a0)
 ; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t2, 30(a0)
-; RV64I-NEXT:    lbu t3, 31(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a6, t2, a6
-; RV64I-NEXT:    or a6, t3, a6
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a4, a4, t1
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t1, 18(a0)
-; RV64I-NEXT:    lbu t2, 19(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    lbu t1, 20(a0)
-; RV64I-NEXT:    lbu t3, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t1
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a7, t3, a7
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or t1, a0, t2
-; RV64I-NEXT:    lbu a0, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    or a0, t2, a0
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t2, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a6, t2, a6
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a0, a0, 35
-; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    srl a0, t1, a1
-; RV64I-NEXT:    not t4, a1
-; RV64I-NEXT:    slli a6, a4, 1
-; RV64I-NEXT:    sll a6, a6, t4
-; RV64I-NEXT:    or a6, a0, a6
-; RV64I-NEXT:    addi t2, a1, -192
-; RV64I-NEXT:    srl a7, a4, a1
-; RV64I-NEXT:    mv t3, a6
-; RV64I-NEXT:    bltz t2, .LBB9_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv t3, a7
-; RV64I-NEXT:  .LBB9_2:
-; RV64I-NEXT:    or a0, t0, a5
-; RV64I-NEXT:    addi a5, a1, -64
-; RV64I-NEXT:    srl t0, a3, a1
-; RV64I-NEXT:    bltz a5, .LBB9_4
-; RV64I-NEXT:  # %bb.3:
-; RV64I-NEXT:    mv s1, t0
-; RV64I-NEXT:    j .LBB9_5
-; RV64I-NEXT:  .LBB9_4:
-; RV64I-NEXT:    srl t5, a0, a1
-; RV64I-NEXT:    slli t6, a3, 1
-; RV64I-NEXT:    sll t4, t6, t4
-; RV64I-NEXT:    or s1, t5, t4
-; RV64I-NEXT:  .LBB9_5:
-; RV64I-NEXT:    negw t6, a1
-; RV64I-NEXT:    sll t4, t1, t6
-; RV64I-NEXT:    li s0, 64
-; RV64I-NEXT:    li t5, 128
-; RV64I-NEXT:    sub s0, s0, a1
-; RV64I-NEXT:    bltu a1, t5, .LBB9_11
-; RV64I-NEXT:  # %bb.6:
-; RV64I-NEXT:    bnez a1, .LBB9_12
-; RV64I-NEXT:  .LBB9_7:
-; RV64I-NEXT:    bgez s0, .LBB9_9
-; RV64I-NEXT:  .LBB9_8:
-; RV64I-NEXT:    sll a4, a4, t6
-; RV64I-NEXT:    srli t1, t1, 1
-; RV64I-NEXT:    subw t3, t5, a1
-; RV64I-NEXT:    not t3, t3
-; RV64I-NEXT:    srl t1, t1, t3
-; RV64I-NEXT:    or t4, a4, t1
-; RV64I-NEXT:  .LBB9_9:
-; RV64I-NEXT:    slti a4, a5, 0
-; RV64I-NEXT:    neg a4, a4
-; RV64I-NEXT:    bltu a1, t5, .LBB9_13
-; RV64I-NEXT:  # %bb.10:
-; RV64I-NEXT:    slti t0, t2, 0
-; RV64I-NEXT:    neg t0, t0
-; RV64I-NEXT:    and t0, t0, a7
-; RV64I-NEXT:    bnez a1, .LBB9_14
-; RV64I-NEXT:    j .LBB9_15
-; RV64I-NEXT:  .LBB9_11:
-; RV64I-NEXT:    slti t3, s0, 0
-; RV64I-NEXT:    neg t3, t3
-; RV64I-NEXT:    and t3, t3, t4
-; RV64I-NEXT:    or t3, s1, t3
-; RV64I-NEXT:    beqz a1, .LBB9_7
-; RV64I-NEXT:  .LBB9_12:
-; RV64I-NEXT:    mv a0, t3
-; RV64I-NEXT:    bltz s0, .LBB9_8
-; RV64I-NEXT:    j .LBB9_9
-; RV64I-NEXT:  .LBB9_13:
-; RV64I-NEXT:    and t0, a4, t0
-; RV64I-NEXT:    or t0, t0, t4
-; RV64I-NEXT:    beqz a1, .LBB9_15
-; RV64I-NEXT:  .LBB9_14:
-; RV64I-NEXT:    mv a3, t0
-; RV64I-NEXT:  .LBB9_15:
-; RV64I-NEXT:    bltz a5, .LBB9_17
-; RV64I-NEXT:  # %bb.16:
-; RV64I-NEXT:    mv a6, a7
-; RV64I-NEXT:  .LBB9_17:
-; RV64I-NEXT:    sltiu a1, a1, 128
-; RV64I-NEXT:    neg a1, a1
-; RV64I-NEXT:    and a5, a1, a6
-; RV64I-NEXT:    and a4, a4, a7
-; RV64I-NEXT:    and a1, a1, a4
-; RV64I-NEXT:    sb a5, 16(a2)
-; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    srli a4, a5, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a5, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a5, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a5, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a5, 24
+; RV64I-NEXT:    lbu a6, 27(a0)
+; RV64I-NEXT:    lbu a5, 28(a0)
+; RV64I-NEXT:    lbu a3, 31(a0)
+; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a0, 29(a0)
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sb a3, 87(sp)
+; RV64I-NEXT:    sb a4, 86(sp)
+; RV64I-NEXT:    sb a0, 85(sp)
+; RV64I-NEXT:    sb a5, 84(sp)
+; RV64I-NEXT:    sb a6, 83(sp)
+; RV64I-NEXT:    sb a7, 82(sp)
+; RV64I-NEXT:    sb zero, 119(sp)
+; RV64I-NEXT:    sb zero, 118(sp)
+; RV64I-NEXT:    sb zero, 117(sp)
+; RV64I-NEXT:    sb zero, 116(sp)
+; RV64I-NEXT:    sb zero, 115(sp)
+; RV64I-NEXT:    sb zero, 114(sp)
+; RV64I-NEXT:    sb zero, 113(sp)
+; RV64I-NEXT:    sb zero, 112(sp)
+; RV64I-NEXT:    sb zero, 111(sp)
+; RV64I-NEXT:    sb zero, 110(sp)
+; RV64I-NEXT:    sb zero, 109(sp)
+; RV64I-NEXT:    sb zero, 108(sp)
+; RV64I-NEXT:    sb zero, 107(sp)
+; RV64I-NEXT:    sb zero, 106(sp)
+; RV64I-NEXT:    sb zero, 105(sp)
+; RV64I-NEXT:    sb zero, 104(sp)
+; RV64I-NEXT:    sb zero, 103(sp)
+; RV64I-NEXT:    sb zero, 102(sp)
+; RV64I-NEXT:    sb zero, 101(sp)
+; RV64I-NEXT:    sb zero, 100(sp)
+; RV64I-NEXT:    sb zero, 99(sp)
+; RV64I-NEXT:    sb zero, 98(sp)
+; RV64I-NEXT:    sb zero, 97(sp)
+; RV64I-NEXT:    sb zero, 96(sp)
+; RV64I-NEXT:    sb zero, 95(sp)
+; RV64I-NEXT:    sb zero, 94(sp)
+; RV64I-NEXT:    sb zero, 93(sp)
+; RV64I-NEXT:    sb zero, 92(sp)
+; RV64I-NEXT:    sb zero, 91(sp)
+; RV64I-NEXT:    sb zero, 90(sp)
+; RV64I-NEXT:    sb zero, 89(sp)
+; RV64I-NEXT:    sb zero, 88(sp)
+; RV64I-NEXT:    sb t0, 81(sp)
+; RV64I-NEXT:    sb ra, 80(sp)
+; RV64I-NEXT:    sb s11, 79(sp)
+; RV64I-NEXT:    sb s10, 78(sp)
+; RV64I-NEXT:    sb s9, 77(sp)
+; RV64I-NEXT:    sb s8, 76(sp)
+; RV64I-NEXT:    sb s7, 75(sp)
+; RV64I-NEXT:    sb s6, 74(sp)
+; RV64I-NEXT:    sb s5, 73(sp)
+; RV64I-NEXT:    sb s4, 72(sp)
+; RV64I-NEXT:    sb s3, 71(sp)
+; RV64I-NEXT:    sb s2, 70(sp)
+; RV64I-NEXT:    sb s1, 69(sp)
+; RV64I-NEXT:    sb s0, 68(sp)
+; RV64I-NEXT:    sb t6, 67(sp)
+; RV64I-NEXT:    sb t5, 66(sp)
+; RV64I-NEXT:    sb t4, 65(sp)
+; RV64I-NEXT:    sb t3, 64(sp)
+; RV64I-NEXT:    sb t2, 63(sp)
+; RV64I-NEXT:    sb t1, 62(sp)
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 61(sp)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 60(sp)
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 59(sp)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 58(sp)
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 57(sp)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 56(sp)
+; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    addi a0, sp, 56
+; RV64I-NEXT:    add a5, a0, a1
+; RV64I-NEXT:    lbu a0, 8(a5)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(a5)
+; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(a5)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 11(a5)
+; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 12(a5)
+; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a7, 13(a5)
+; RV64I-NEXT:    lbu t0, 14(a5)
+; RV64I-NEXT:    lbu t1, 15(a5)
+; RV64I-NEXT:    lbu t2, 0(a5)
+; RV64I-NEXT:    lbu t3, 1(a5)
+; RV64I-NEXT:    lbu t4, 2(a5)
+; RV64I-NEXT:    lbu t5, 3(a5)
+; RV64I-NEXT:    lbu t6, 4(a5)
+; RV64I-NEXT:    lbu s0, 5(a5)
+; RV64I-NEXT:    lbu s1, 6(a5)
+; RV64I-NEXT:    lbu s2, 7(a5)
+; RV64I-NEXT:    lbu s3, 24(a5)
+; RV64I-NEXT:    lbu s4, 25(a5)
+; RV64I-NEXT:    lbu s5, 26(a5)
+; RV64I-NEXT:    lbu s6, 27(a5)
+; RV64I-NEXT:    lbu s7, 28(a5)
+; RV64I-NEXT:    lbu s8, 29(a5)
+; RV64I-NEXT:    lbu s9, 30(a5)
+; RV64I-NEXT:    lbu s10, 31(a5)
+; RV64I-NEXT:    lbu s11, 16(a5)
+; RV64I-NEXT:    lbu ra, 17(a5)
+; RV64I-NEXT:    lbu a6, 18(a5)
+; RV64I-NEXT:    lbu a4, 19(a5)
+; RV64I-NEXT:    lbu a0, 23(a5)
+; RV64I-NEXT:    lbu a1, 22(a5)
+; RV64I-NEXT:    lbu a3, 21(a5)
+; RV64I-NEXT:    lbu a5, 20(a5)
+; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    sb a5, 20(a2)
 ; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a5, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a4, a1, 56
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 30(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 29(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 28(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 27(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 26(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 25(a2)
-; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 48
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    srli a1, a0, 40
-; RV64I-NEXT:    sb a1, 5(a2)
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    sb a1, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 1(a2)
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a0, a3, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    srli a0, a3, 48
-; RV64I-NEXT:    sb a0, 14(a2)
-; RV64I-NEXT:    srli a0, a3, 40
-; RV64I-NEXT:    sb a0, 13(a2)
-; RV64I-NEXT:    srli a0, a3, 32
+; RV64I-NEXT:    sb a6, 18(a2)
+; RV64I-NEXT:    sb ra, 17(a2)
+; RV64I-NEXT:    sb s11, 16(a2)
+; RV64I-NEXT:    sb s10, 31(a2)
+; RV64I-NEXT:    sb s9, 30(a2)
+; RV64I-NEXT:    sb s8, 29(a2)
+; RV64I-NEXT:    sb s7, 28(a2)
+; RV64I-NEXT:    sb s6, 27(a2)
+; RV64I-NEXT:    sb s5, 26(a2)
+; RV64I-NEXT:    sb s4, 25(a2)
+; RV64I-NEXT:    sb s3, 24(a2)
+; RV64I-NEXT:    sb s2, 7(a2)
+; RV64I-NEXT:    sb s1, 6(a2)
+; RV64I-NEXT:    sb s0, 5(a2)
+; RV64I-NEXT:    sb t6, 4(a2)
+; RV64I-NEXT:    sb t5, 3(a2)
+; RV64I-NEXT:    sb t4, 2(a2)
+; RV64I-NEXT:    sb t3, 1(a2)
+; RV64I-NEXT:    sb t2, 0(a2)
+; RV64I-NEXT:    sb t1, 15(a2)
+; RV64I-NEXT:    sb t0, 14(a2)
+; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    srli a0, a3, 24
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    srli a0, a3, 16
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 224
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw ra, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 4(a0)
-; RV32I-NEXT:    lbu t4, 5(a0)
-; RV32I-NEXT:    lbu t6, 6(a0)
-; RV32I-NEXT:    lbu s0, 7(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t3, 3(a0)
-; RV32I-NEXT:    lbu s1, 12(a0)
-; RV32I-NEXT:    lbu t5, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s6, 15(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s7, 9(a0)
-; RV32I-NEXT:    lbu s8, 10(a0)
-; RV32I-NEXT:    lbu s9, 11(a0)
-; RV32I-NEXT:    lbu a3, 21(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu a5, 22(a0)
-; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    lbu a4, 17(a0)
-; RV32I-NEXT:    lbu a5, 16(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu s4, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or s10, s4, a4
-; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    addi sp, sp, -144
+; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu t0, 25(a0)
+; RV32I-NEXT:    lbu a7, 26(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
 ; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu t2, 30(a0)
-; RV32I-NEXT:    lbu s4, 31(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or s11, s4, a4
-; RV32I-NEXT:    lbu a4, 25(a0)
-; RV32I-NEXT:    lbu a5, 24(a0)
-; RV32I-NEXT:    lbu t2, 26(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or s5, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    slli a5, a0, 3
-; RV32I-NEXT:    srl a0, s5, a5
-; RV32I-NEXT:    slli a1, s11, 1
-; RV32I-NEXT:    not s4, a5
-; RV32I-NEXT:    sll a1, a1, s4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    addi a4, a5, -224
-; RV32I-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    srl a1, s11, a5
-; RV32I-NEXT:    sw a0, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a4, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a4, .LBB9_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli t2, t5, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s11, s6, 24
-; RV32I-NEXT:    slli ra, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    srl a1, s10, a5
-; RV32I-NEXT:    slli t5, a3, 1
-; RV32I-NEXT:    sll a4, t5, s4
-; RV32I-NEXT:    or s6, a1, a4
-; RV32I-NEXT:    addi s7, a5, -160
-; RV32I-NEXT:    srl a1, a3, a5
-; RV32I-NEXT:    sw a1, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s7, .LBB9_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_4:
-; RV32I-NEXT:    slli a4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t2, t2, s1
-; RV32I-NEXT:    or s1, s11, s3
-; RV32I-NEXT:    or s2, ra, s2
-; RV32I-NEXT:    or s3, s9, s8
-; RV32I-NEXT:    neg s11, a5
-; RV32I-NEXT:    sll t4, s5, s11
-; RV32I-NEXT:    li a1, 160
-; RV32I-NEXT:    addi s8, a5, -128
-; RV32I-NEXT:    li s9, 64
-; RV32I-NEXT:    sub a1, a1, a5
-; RV32I-NEXT:    bltu s8, s9, .LBB9_6
-; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv ra, t4
-; RV32I-NEXT:    j .LBB9_7
-; RV32I-NEXT:  .LBB9_6:
-; RV32I-NEXT:    slti a0, a1, 0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    mv ra, t4
-; RV32I-NEXT:    and a0, a0, t4
-; RV32I-NEXT:    or a0, s6, a0
-; RV32I-NEXT:  .LBB9_7:
-; RV32I-NEXT:    sw a1, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli s6, t3, 24
-; RV32I-NEXT:    or t3, a4, a6
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or a6, s1, t2
-; RV32I-NEXT:    or s3, s3, s2
-; RV32I-NEXT:    mv s9, s10
-; RV32I-NEXT:    beqz s8, .LBB9_9
-; RV32I-NEXT:  # %bb.8:
-; RV32I-NEXT:    mv s9, a0
-; RV32I-NEXT:  .LBB9_9:
-; RV32I-NEXT:    or a4, t0, a7
-; RV32I-NEXT:    or t0, s6, t1
-; RV32I-NEXT:    or a0, t6, t3
-; RV32I-NEXT:    srl a1, s3, a5
-; RV32I-NEXT:    slli a7, a6, 1
-; RV32I-NEXT:    sll a7, a7, s4
-; RV32I-NEXT:    or s2, a1, a7
-; RV32I-NEXT:    addi t2, a5, -96
-; RV32I-NEXT:    srl a1, a6, a5
-; RV32I-NEXT:    sw a1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a7, s2
-; RV32I-NEXT:    bltz t2, .LBB9_11
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    lw a7, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_11:
-; RV32I-NEXT:    or t0, t0, a4
-; RV32I-NEXT:    addi t6, a5, -32
-; RV32I-NEXT:    srl t1, a0, a5
-; RV32I-NEXT:    sw t1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgez t6, .LBB9_13
-; RV32I-NEXT:  # %bb.12:
-; RV32I-NEXT:    srl a1, t0, a5
-; RV32I-NEXT:    slli a4, a0, 1
-; RV32I-NEXT:    sll a4, a4, s4
-; RV32I-NEXT:    or t1, a1, a4
-; RV32I-NEXT:  .LBB9_13:
-; RV32I-NEXT:    sll t3, s3, s11
-; RV32I-NEXT:    li a4, 32
-; RV32I-NEXT:    sub s0, a4, a5
-; RV32I-NEXT:    slti a1, s0, 0
-; RV32I-NEXT:    neg s1, a1
-; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    bgeu a5, a1, .LBB9_15
-; RV32I-NEXT:  # %bb.14:
-; RV32I-NEXT:    and a1, s1, t3
-; RV32I-NEXT:    or a7, t1, a1
-; RV32I-NEXT:  .LBB9_15:
-; RV32I-NEXT:    mv t4, s7
-; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s4, t0
-; RV32I-NEXT:    beqz a5, .LBB9_17
-; RV32I-NEXT:  # %bb.16:
-; RV32I-NEXT:    mv s4, a7
-; RV32I-NEXT:  .LBB9_17:
-; RV32I-NEXT:    sll s1, s10, s11
-; RV32I-NEXT:    li a1, 96
-; RV32I-NEXT:    sub s6, a1, a5
-; RV32I-NEXT:    slti a1, s6, 0
-; RV32I-NEXT:    neg a7, a1
-; RV32I-NEXT:    li s7, 128
-; RV32I-NEXT:    sub t1, s7, a5
-; RV32I-NEXT:    sltiu a1, t1, 64
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu a5, s7, .LBB9_19
-; RV32I-NEXT:  # %bb.18:
-; RV32I-NEXT:    mv s7, a1
-; RV32I-NEXT:    and a1, a7, s1
-; RV32I-NEXT:    and a1, s7, a1
-; RV32I-NEXT:    or s9, s4, a1
-; RV32I-NEXT:  .LBB9_19:
-; RV32I-NEXT:    mv s4, ra
-; RV32I-NEXT:    mv s7, t4
-; RV32I-NEXT:    li ra, 64
-; RV32I-NEXT:    beqz a5, .LBB9_21
-; RV32I-NEXT:  # %bb.20:
-; RV32I-NEXT:    mv t0, s9
-; RV32I-NEXT:  .LBB9_21:
-; RV32I-NEXT:    neg a1, t1
-; RV32I-NEXT:    sub a4, a4, t1
-; RV32I-NEXT:    srl t4, a3, a1
-; RV32I-NEXT:    sw a4, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a4, .LBB9_24
-; RV32I-NEXT:  # %bb.22:
-; RV32I-NEXT:    mv a1, t4
-; RV32I-NEXT:    lw t5, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgeu t1, ra, .LBB9_25
-; RV32I-NEXT:  .LBB9_23:
-; RV32I-NEXT:    and a4, a7, s4
-; RV32I-NEXT:    or a7, a4, a1
-; RV32I-NEXT:    mv a4, s5
-; RV32I-NEXT:    bnez t1, .LBB9_26
-; RV32I-NEXT:    j .LBB9_27
-; RV32I-NEXT:  .LBB9_24:
-; RV32I-NEXT:    srl a1, s10, a1
-; RV32I-NEXT:    sub a4, ra, t1
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    sll a4, t5, a4
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    lw t5, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltu t1, ra, .LBB9_23
-; RV32I-NEXT:  .LBB9_25:
-; RV32I-NEXT:    lw a1, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a7, a1, s1
-; RV32I-NEXT:    mv a4, s5
-; RV32I-NEXT:    beqz t1, .LBB9_27
-; RV32I-NEXT:  .LBB9_26:
-; RV32I-NEXT:    mv a4, a7
-; RV32I-NEXT:  .LBB9_27:
-; RV32I-NEXT:    bltz t6, .LBB9_29
-; RV32I-NEXT:  # %bb.28:
-; RV32I-NEXT:    lw s2, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_29:
-; RV32I-NEXT:    sltiu a1, a5, 64
-; RV32I-NEXT:    mv a7, t5
-; RV32I-NEXT:    bltz s7, .LBB9_31
-; RV32I-NEXT:  # %bb.30:
-; RV32I-NEXT:    lw a7, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_31:
-; RV32I-NEXT:    neg s9, a1
-; RV32I-NEXT:    sltiu a1, s8, 64
-; RV32I-NEXT:    neg t5, a1
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB9_33
-; RV32I-NEXT:  # %bb.32:
-; RV32I-NEXT:    and a4, t5, a7
-; RV32I-NEXT:    mv s2, s3
-; RV32I-NEXT:    bnez a5, .LBB9_34
-; RV32I-NEXT:    j .LBB9_35
-; RV32I-NEXT:  .LBB9_33:
-; RV32I-NEXT:    and a1, s9, s2
-; RV32I-NEXT:    or a4, a1, a4
-; RV32I-NEXT:    mv s2, s3
-; RV32I-NEXT:    beqz a5, .LBB9_35
-; RV32I-NEXT:  .LBB9_34:
-; RV32I-NEXT:    mv s2, a4
-; RV32I-NEXT:  .LBB9_35:
-; RV32I-NEXT:    sub a1, ra, a5
-; RV32I-NEXT:    not a7, a1
-; RV32I-NEXT:    bgez s0, .LBB9_37
-; RV32I-NEXT:  # %bb.36:
-; RV32I-NEXT:    sll a1, a6, s11
-; RV32I-NEXT:    srli a4, s3, 1
-; RV32I-NEXT:    srl a4, a4, a7
-; RV32I-NEXT:    or t3, a1, a4
-; RV32I-NEXT:  .LBB9_37:
-; RV32I-NEXT:    slti a1, t6, 0
-; RV32I-NEXT:    neg s3, a1
-; RV32I-NEXT:    slti a1, t2, 0
-; RV32I-NEXT:    neg a4, a1
-; RV32I-NEXT:    sw a7, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltu a5, ra, .LBB9_39
-; RV32I-NEXT:  # %bb.38:
-; RV32I-NEXT:    lw a1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, a4, a1
-; RV32I-NEXT:    j .LBB9_40
-; RV32I-NEXT:  .LBB9_39:
-; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s3, a1
-; RV32I-NEXT:    or a4, a1, t3
-; RV32I-NEXT:  .LBB9_40:
-; RV32I-NEXT:    sw t5, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw t4, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a7, a0
-; RV32I-NEXT:    beqz a5, .LBB9_42
-; RV32I-NEXT:  # %bb.41:
-; RV32I-NEXT:    mv a7, a4
-; RV32I-NEXT:  .LBB9_42:
-; RV32I-NEXT:    mv t4, t2
-; RV32I-NEXT:    mv ra, s4
-; RV32I-NEXT:    sll s4, a3, s11
-; RV32I-NEXT:    srli a4, s10, 1
-; RV32I-NEXT:    not t5, t1
-; RV32I-NEXT:    bltz s6, .LBB9_44
-; RV32I-NEXT:  # %bb.43:
-; RV32I-NEXT:    mv t2, s1
-; RV32I-NEXT:    j .LBB9_45
-; RV32I-NEXT:  .LBB9_44:
-; RV32I-NEXT:    srl a1, a4, t5
-; RV32I-NEXT:    or t2, s4, a1
-; RV32I-NEXT:  .LBB9_45:
-; RV32I-NEXT:    lw a1, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll t3, a1, s11
-; RV32I-NEXT:    srli s5, s5, 1
-; RV32I-NEXT:    lw a1, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a1, .LBB9_47
-; RV32I-NEXT:  # %bb.46:
-; RV32I-NEXT:    mv s11, ra
-; RV32I-NEXT:    j .LBB9_48
-; RV32I-NEXT:  .LBB9_47:
-; RV32I-NEXT:    li a1, 192
-; RV32I-NEXT:    sub a1, a1, a5
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    srl a1, s5, a1
-; RV32I-NEXT:    or s11, t3, a1
-; RV32I-NEXT:  .LBB9_48:
-; RV32I-NEXT:    slti a1, s7, 0
-; RV32I-NEXT:    neg s7, a1
-; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    bltu s8, a1, .LBB9_50
-; RV32I-NEXT:  # %bb.49:
-; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a1, a1, 0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    lw s11, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a1, s11
-; RV32I-NEXT:    mv s11, a3
-; RV32I-NEXT:    bnez s8, .LBB9_51
-; RV32I-NEXT:    j .LBB9_52
-; RV32I-NEXT:  .LBB9_50:
-; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s7, a1
-; RV32I-NEXT:    or a1, a1, s11
-; RV32I-NEXT:    mv s11, a3
-; RV32I-NEXT:    beqz s8, .LBB9_52
-; RV32I-NEXT:  .LBB9_51:
-; RV32I-NEXT:    mv s11, a1
-; RV32I-NEXT:  .LBB9_52:
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB9_57
-; RV32I-NEXT:  # %bb.53:
-; RV32I-NEXT:    bnez a5, .LBB9_58
-; RV32I-NEXT:  .LBB9_54:
-; RV32I-NEXT:    bltz s0, .LBB9_59
-; RV32I-NEXT:  .LBB9_55:
-; RV32I-NEXT:    bltz s6, .LBB9_60
-; RV32I-NEXT:  .LBB9_56:
-; RV32I-NEXT:    mv a4, ra
-; RV32I-NEXT:    j .LBB9_61
-; RV32I-NEXT:  .LBB9_57:
-; RV32I-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a1, t2
-; RV32I-NEXT:    or s11, a7, a1
-; RV32I-NEXT:    beqz a5, .LBB9_54
-; RV32I-NEXT:  .LBB9_58:
-; RV32I-NEXT:    mv a0, s11
-; RV32I-NEXT:    bgez s0, .LBB9_55
-; RV32I-NEXT:  .LBB9_59:
-; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srl a1, a4, a1
-; RV32I-NEXT:    or s1, s4, a1
-; RV32I-NEXT:    bgez s6, .LBB9_56
-; RV32I-NEXT:  .LBB9_60:
-; RV32I-NEXT:    srl a1, s5, t5
-; RV32I-NEXT:    or a4, t3, a1
-; RV32I-NEXT:  .LBB9_61:
-; RV32I-NEXT:    lw t5, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    bltu t1, a1, .LBB9_65
-; RV32I-NEXT:  # %bb.62:
-; RV32I-NEXT:    bnez t1, .LBB9_66
-; RV32I-NEXT:  .LBB9_63:
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB9_67
-; RV32I-NEXT:  .LBB9_64:
-; RV32I-NEXT:    lw t2, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s7, t2
-; RV32I-NEXT:    lw a4, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a4, a1
-; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bnez a5, .LBB9_68
-; RV32I-NEXT:    j .LBB9_69
-; RV32I-NEXT:  .LBB9_65:
-; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a1, a1, 0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a1, t2
-; RV32I-NEXT:    or s1, a4, a1
-; RV32I-NEXT:    beqz t1, .LBB9_63
-; RV32I-NEXT:  .LBB9_66:
-; RV32I-NEXT:    sw s1, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bgeu a5, a1, .LBB9_64
-; RV32I-NEXT:  .LBB9_67:
-; RV32I-NEXT:    lw a1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s3, a1
-; RV32I-NEXT:    and a1, s9, a1
-; RV32I-NEXT:    lw a4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    lw t2, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    beqz a5, .LBB9_69
-; RV32I-NEXT:  .LBB9_68:
-; RV32I-NEXT:    mv a6, a1
-; RV32I-NEXT:  .LBB9_69:
-; RV32I-NEXT:    mv a4, t5
-; RV32I-NEXT:    bgez t4, .LBB9_76
-; RV32I-NEXT:  # %bb.70:
-; RV32I-NEXT:    bgez t6, .LBB9_77
-; RV32I-NEXT:  .LBB9_71:
-; RV32I-NEXT:    li t1, 64
-; RV32I-NEXT:    bltu a5, t1, .LBB9_78
-; RV32I-NEXT:  .LBB9_72:
-; RV32I-NEXT:    bnez a5, .LBB9_79
-; RV32I-NEXT:  .LBB9_73:
-; RV32I-NEXT:    bltz s0, .LBB9_80
-; RV32I-NEXT:  .LBB9_74:
-; RV32I-NEXT:    sltiu a4, a5, 128
-; RV32I-NEXT:    bltu a5, t1, .LBB9_81
-; RV32I-NEXT:  .LBB9_75:
-; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t1, a1, t2
-; RV32I-NEXT:    neg a7, a4
-; RV32I-NEXT:    bnez a5, .LBB9_82
-; RV32I-NEXT:    j .LBB9_83
-; RV32I-NEXT:  .LBB9_76:
-; RV32I-NEXT:    mv a4, t2
-; RV32I-NEXT:    bltz t6, .LBB9_71
-; RV32I-NEXT:  .LBB9_77:
-; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    li t1, 64
-; RV32I-NEXT:    bgeu a5, t1, .LBB9_72
-; RV32I-NEXT:  .LBB9_78:
-; RV32I-NEXT:    and a1, s1, ra
-; RV32I-NEXT:    lw a4, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, a1
-; RV32I-NEXT:    beqz a5, .LBB9_73
-; RV32I-NEXT:  .LBB9_79:
-; RV32I-NEXT:    mv s10, a4
-; RV32I-NEXT:    bgez s0, .LBB9_74
-; RV32I-NEXT:  .LBB9_80:
-; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srl a1, s5, a1
-; RV32I-NEXT:    or ra, t3, a1
-; RV32I-NEXT:    sltiu a4, a5, 128
-; RV32I-NEXT:    bgeu a5, t1, .LBB9_75
-; RV32I-NEXT:  .LBB9_81:
-; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s3, a1
-; RV32I-NEXT:    or t1, a1, ra
-; RV32I-NEXT:    neg a7, a4
-; RV32I-NEXT:    beqz a5, .LBB9_83
-; RV32I-NEXT:  .LBB9_82:
-; RV32I-NEXT:    mv a3, t1
-; RV32I-NEXT:  .LBB9_83:
-; RV32I-NEXT:    and a4, a7, s10
-; RV32I-NEXT:    and a3, a7, a3
-; RV32I-NEXT:    bltz t6, .LBB9_85
-; RV32I-NEXT:  # %bb.84:
-; RV32I-NEXT:    mv t5, t2
-; RV32I-NEXT:  .LBB9_85:
-; RV32I-NEXT:    and a1, a7, t5
-; RV32I-NEXT:    and a1, a1, s9
-; RV32I-NEXT:    and a5, s3, t2
-; RV32I-NEXT:    and a5, a7, a5
-; RV32I-NEXT:    and a5, a5, s9
+; RV32I-NEXT:    lbu a3, 31(a0)
+; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a0, 29(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sb a3, 59(sp)
+; RV32I-NEXT:    sb a4, 58(sp)
+; RV32I-NEXT:    sb a0, 57(sp)
+; RV32I-NEXT:    sb a5, 56(sp)
+; RV32I-NEXT:    sb a6, 55(sp)
+; RV32I-NEXT:    sb a7, 54(sp)
+; RV32I-NEXT:    sb zero, 91(sp)
+; RV32I-NEXT:    sb zero, 90(sp)
+; RV32I-NEXT:    sb zero, 89(sp)
+; RV32I-NEXT:    sb zero, 88(sp)
+; RV32I-NEXT:    sb zero, 87(sp)
+; RV32I-NEXT:    sb zero, 86(sp)
+; RV32I-NEXT:    sb zero, 85(sp)
+; RV32I-NEXT:    sb zero, 84(sp)
+; RV32I-NEXT:    sb zero, 83(sp)
+; RV32I-NEXT:    sb zero, 82(sp)
+; RV32I-NEXT:    sb zero, 81(sp)
+; RV32I-NEXT:    sb zero, 80(sp)
+; RV32I-NEXT:    sb zero, 79(sp)
+; RV32I-NEXT:    sb zero, 78(sp)
+; RV32I-NEXT:    sb zero, 77(sp)
+; RV32I-NEXT:    sb zero, 76(sp)
+; RV32I-NEXT:    sb zero, 75(sp)
+; RV32I-NEXT:    sb zero, 74(sp)
+; RV32I-NEXT:    sb zero, 73(sp)
+; RV32I-NEXT:    sb zero, 72(sp)
+; RV32I-NEXT:    sb zero, 71(sp)
+; RV32I-NEXT:    sb zero, 70(sp)
+; RV32I-NEXT:    sb zero, 69(sp)
+; RV32I-NEXT:    sb zero, 68(sp)
+; RV32I-NEXT:    sb zero, 67(sp)
+; RV32I-NEXT:    sb zero, 66(sp)
+; RV32I-NEXT:    sb zero, 65(sp)
+; RV32I-NEXT:    sb zero, 64(sp)
+; RV32I-NEXT:    sb zero, 63(sp)
+; RV32I-NEXT:    sb zero, 62(sp)
+; RV32I-NEXT:    sb zero, 61(sp)
+; RV32I-NEXT:    sb zero, 60(sp)
+; RV32I-NEXT:    sb t0, 53(sp)
+; RV32I-NEXT:    sb ra, 52(sp)
+; RV32I-NEXT:    sb s11, 51(sp)
+; RV32I-NEXT:    sb s10, 50(sp)
+; RV32I-NEXT:    sb s9, 49(sp)
+; RV32I-NEXT:    sb s8, 48(sp)
+; RV32I-NEXT:    sb s7, 47(sp)
+; RV32I-NEXT:    sb s6, 46(sp)
+; RV32I-NEXT:    sb s5, 45(sp)
+; RV32I-NEXT:    sb s4, 44(sp)
+; RV32I-NEXT:    sb s3, 43(sp)
+; RV32I-NEXT:    sb s2, 42(sp)
+; RV32I-NEXT:    sb s1, 41(sp)
+; RV32I-NEXT:    sb s0, 40(sp)
+; RV32I-NEXT:    sb t6, 39(sp)
+; RV32I-NEXT:    sb t5, 38(sp)
+; RV32I-NEXT:    sb t4, 37(sp)
+; RV32I-NEXT:    sb t3, 36(sp)
+; RV32I-NEXT:    sb t2, 35(sp)
+; RV32I-NEXT:    sb t1, 34(sp)
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 33(sp)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 32(sp)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 31(sp)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 30(sp)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 29(sp)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 28(sp)
+; RV32I-NEXT:    andi a1, a1, 31
+; RV32I-NEXT:    addi a0, sp, 28
+; RV32I-NEXT:    add a5, a0, a1
+; RV32I-NEXT:    lbu a0, 6(a5)
+; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(a5)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 4(a5)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 5(a5)
+; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 0(a5)
+; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 1(a5)
+; RV32I-NEXT:    lbu t0, 2(a5)
+; RV32I-NEXT:    lbu t1, 3(a5)
+; RV32I-NEXT:    lbu t2, 14(a5)
+; RV32I-NEXT:    lbu t3, 15(a5)
+; RV32I-NEXT:    lbu t4, 12(a5)
+; RV32I-NEXT:    lbu t5, 13(a5)
+; RV32I-NEXT:    lbu t6, 10(a5)
+; RV32I-NEXT:    lbu s0, 11(a5)
+; RV32I-NEXT:    lbu s1, 8(a5)
+; RV32I-NEXT:    lbu s2, 9(a5)
+; RV32I-NEXT:    lbu s3, 22(a5)
+; RV32I-NEXT:    lbu s4, 23(a5)
+; RV32I-NEXT:    lbu s5, 20(a5)
+; RV32I-NEXT:    lbu s6, 21(a5)
+; RV32I-NEXT:    lbu s7, 18(a5)
+; RV32I-NEXT:    lbu s8, 19(a5)
+; RV32I-NEXT:    lbu s9, 16(a5)
+; RV32I-NEXT:    lbu s10, 17(a5)
+; RV32I-NEXT:    lbu s11, 30(a5)
+; RV32I-NEXT:    lbu ra, 31(a5)
+; RV32I-NEXT:    lbu a6, 28(a5)
+; RV32I-NEXT:    lbu a4, 29(a5)
+; RV32I-NEXT:    lbu a0, 25(a5)
+; RV32I-NEXT:    lbu a1, 24(a5)
+; RV32I-NEXT:    lbu a3, 27(a5)
+; RV32I-NEXT:    lbu a5, 26(a5)
+; RV32I-NEXT:    sb a0, 25(a2)
 ; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    srli a7, a1, 24
-; RV32I-NEXT:    sb a7, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
-; RV32I-NEXT:    sb a7, 26(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 25(a2)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 31(a2)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 30(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 29(a2)
-; RV32I-NEXT:    sb a4, 16(a2)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 19(a2)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 18(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 17(a2)
-; RV32I-NEXT:    sb a3, 20(a2)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 23(a2)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 22(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 21(a2)
-; RV32I-NEXT:    sb t0, 0(a2)
-; RV32I-NEXT:    sb a6, 12(a2)
-; RV32I-NEXT:    srli a1, t0, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a1, t0, 16
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    srli a1, t0, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb s2, 8(a2)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    srli a1, a6, 16
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    srli a1, a6, 8
-; RV32I-NEXT:    sb a1, 13(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a3, 27(a2)
+; RV32I-NEXT:    sb a5, 26(a2)
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 28(a2)
+; RV32I-NEXT:    sb ra, 31(a2)
+; RV32I-NEXT:    sb s11, 30(a2)
+; RV32I-NEXT:    sb s10, 17(a2)
+; RV32I-NEXT:    sb s9, 16(a2)
+; RV32I-NEXT:    sb s8, 19(a2)
+; RV32I-NEXT:    sb s7, 18(a2)
+; RV32I-NEXT:    sb s6, 21(a2)
+; RV32I-NEXT:    sb s5, 20(a2)
+; RV32I-NEXT:    sb s4, 23(a2)
+; RV32I-NEXT:    sb s3, 22(a2)
+; RV32I-NEXT:    sb s2, 9(a2)
+; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    sb s0, 11(a2)
+; RV32I-NEXT:    sb t6, 10(a2)
+; RV32I-NEXT:    sb t5, 13(a2)
+; RV32I-NEXT:    sb t4, 12(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t1, 3(a2)
+; RV32I-NEXT:    sb t0, 2(a2)
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, s2, 24
-; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a0, s2, 16
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    srli a0, s2, 8
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    lw ra, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 144
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2325,817 +1719,438 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 17(a0)
-; RV64I-NEXT:    lbu a4, 16(a0)
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu a6, 19(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a4, 21(a0)
-; RV64I-NEXT:    lbu a5, 20(a0)
-; RV64I-NEXT:    lbu a7, 22(a0)
-; RV64I-NEXT:    lbu t0, 23(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
-; RV64I-NEXT:    lbu a4, 25(a0)
-; RV64I-NEXT:    lbu a5, 24(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a7, 27(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a5, a7, a4
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a6, 28(a0)
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu t0, 31(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or t0, t0, a4
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t2, 6(a0)
-; RV64I-NEXT:    lbu t3, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a6, t2, a6
-; RV64I-NEXT:    or a6, t3, a6
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a4, a4, t1
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
-; RV64I-NEXT:    lbu t1, 10(a0)
-; RV64I-NEXT:    lbu t2, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t1, 12(a0)
-; RV64I-NEXT:    lbu t3, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t1
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a7, t3, a7
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or t1, a0, t2
-; RV64I-NEXT:    lbu a0, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    or a0, t2, a0
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t2, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a6, t2, a6
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a0, a0, 35
-; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    sll a0, t1, a1
-; RV64I-NEXT:    not t4, a1
-; RV64I-NEXT:    srli a6, a4, 1
-; RV64I-NEXT:    srl a6, a6, t4
-; RV64I-NEXT:    or a6, a0, a6
-; RV64I-NEXT:    addi t2, a1, -192
-; RV64I-NEXT:    sll a7, a4, a1
-; RV64I-NEXT:    mv t3, a6
-; RV64I-NEXT:    bltz t2, .LBB10_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv t3, a7
-; RV64I-NEXT:  .LBB10_2:
-; RV64I-NEXT:    or a0, t0, a5
-; RV64I-NEXT:    addi a5, a1, -64
-; RV64I-NEXT:    sll t0, a3, a1
-; RV64I-NEXT:    bltz a5, .LBB10_4
-; RV64I-NEXT:  # %bb.3:
-; RV64I-NEXT:    mv s1, t0
-; RV64I-NEXT:    j .LBB10_5
-; RV64I-NEXT:  .LBB10_4:
-; RV64I-NEXT:    sll t5, a0, a1
-; RV64I-NEXT:    srli t6, a3, 1
-; RV64I-NEXT:    srl t4, t6, t4
-; RV64I-NEXT:    or s1, t5, t4
-; RV64I-NEXT:  .LBB10_5:
-; RV64I-NEXT:    negw t6, a1
-; RV64I-NEXT:    srl t4, t1, t6
-; RV64I-NEXT:    li s0, 64
-; RV64I-NEXT:    li t5, 128
-; RV64I-NEXT:    sub s0, s0, a1
-; RV64I-NEXT:    bltu a1, t5, .LBB10_11
-; RV64I-NEXT:  # %bb.6:
-; RV64I-NEXT:    bnez a1, .LBB10_12
-; RV64I-NEXT:  .LBB10_7:
-; RV64I-NEXT:    bgez s0, .LBB10_9
-; RV64I-NEXT:  .LBB10_8:
-; RV64I-NEXT:    srl a4, a4, t6
-; RV64I-NEXT:    slli t1, t1, 1
-; RV64I-NEXT:    subw t3, t5, a1
-; RV64I-NEXT:    not t3, t3
-; RV64I-NEXT:    sll t1, t1, t3
-; RV64I-NEXT:    or t4, a4, t1
-; RV64I-NEXT:  .LBB10_9:
-; RV64I-NEXT:    slti a4, a5, 0
-; RV64I-NEXT:    neg a4, a4
-; RV64I-NEXT:    bltu a1, t5, .LBB10_13
-; RV64I-NEXT:  # %bb.10:
-; RV64I-NEXT:    slti t0, t2, 0
-; RV64I-NEXT:    neg t0, t0
-; RV64I-NEXT:    and t0, t0, a7
-; RV64I-NEXT:    bnez a1, .LBB10_14
-; RV64I-NEXT:    j .LBB10_15
-; RV64I-NEXT:  .LBB10_11:
-; RV64I-NEXT:    slti t3, s0, 0
-; RV64I-NEXT:    neg t3, t3
-; RV64I-NEXT:    and t3, t3, t4
-; RV64I-NEXT:    or t3, s1, t3
-; RV64I-NEXT:    beqz a1, .LBB10_7
-; RV64I-NEXT:  .LBB10_12:
-; RV64I-NEXT:    mv a0, t3
-; RV64I-NEXT:    bltz s0, .LBB10_8
-; RV64I-NEXT:    j .LBB10_9
-; RV64I-NEXT:  .LBB10_13:
-; RV64I-NEXT:    and t0, a4, t0
-; RV64I-NEXT:    or t0, t0, t4
-; RV64I-NEXT:    beqz a1, .LBB10_15
-; RV64I-NEXT:  .LBB10_14:
-; RV64I-NEXT:    mv a3, t0
-; RV64I-NEXT:  .LBB10_15:
-; RV64I-NEXT:    bltz a5, .LBB10_17
-; RV64I-NEXT:  # %bb.16:
-; RV64I-NEXT:    mv a6, a7
-; RV64I-NEXT:  .LBB10_17:
-; RV64I-NEXT:    sltiu a1, a1, 128
-; RV64I-NEXT:    neg a1, a1
-; RV64I-NEXT:    and a5, a1, a6
-; RV64I-NEXT:    and a4, a4, a7
-; RV64I-NEXT:    and a1, a1, a4
-; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 8(a2)
-; RV64I-NEXT:    srli a4, a1, 56
-; RV64I-NEXT:    sb a4, 7(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 4(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 3(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 1(a2)
-; RV64I-NEXT:    srli a1, a5, 56
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a5, 48
-; RV64I-NEXT:    sb a1, 14(a2)
-; RV64I-NEXT:    srli a1, a5, 40
-; RV64I-NEXT:    sb a1, 13(a2)
-; RV64I-NEXT:    srli a1, a5, 32
-; RV64I-NEXT:    sb a1, 12(a2)
-; RV64I-NEXT:    srli a1, a5, 24
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    srli a1, a5, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a0, 24(a2)
-; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 31(a2)
-; RV64I-NEXT:    srli a1, a0, 48
-; RV64I-NEXT:    sb a1, 30(a2)
-; RV64I-NEXT:    srli a1, a0, 40
-; RV64I-NEXT:    sb a1, 29(a2)
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    sb a1, 28(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    sb a1, 27(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    sb a1, 26(a2)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 25(a2)
-; RV64I-NEXT:    srli a0, a3, 56
+; RV64I-NEXT:    addi sp, sp, -224
+; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 3(a0)
+; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 4(a0)
+; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    lbu ra, 24(a0)
+; RV64I-NEXT:    lbu t0, 25(a0)
+; RV64I-NEXT:    lbu a7, 26(a0)
+; RV64I-NEXT:    lbu a6, 27(a0)
+; RV64I-NEXT:    lbu a5, 28(a0)
+; RV64I-NEXT:    lbu a3, 31(a0)
+; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a0, 29(a0)
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sb a3, 119(sp)
+; RV64I-NEXT:    sb a4, 118(sp)
+; RV64I-NEXT:    sb a0, 117(sp)
+; RV64I-NEXT:    sb a5, 116(sp)
+; RV64I-NEXT:    sb a6, 115(sp)
+; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    sb zero, 87(sp)
+; RV64I-NEXT:    sb zero, 86(sp)
+; RV64I-NEXT:    sb zero, 85(sp)
+; RV64I-NEXT:    sb zero, 84(sp)
+; RV64I-NEXT:    sb zero, 83(sp)
+; RV64I-NEXT:    sb zero, 82(sp)
+; RV64I-NEXT:    sb zero, 81(sp)
+; RV64I-NEXT:    sb zero, 80(sp)
+; RV64I-NEXT:    sb zero, 79(sp)
+; RV64I-NEXT:    sb zero, 78(sp)
+; RV64I-NEXT:    sb zero, 77(sp)
+; RV64I-NEXT:    sb zero, 76(sp)
+; RV64I-NEXT:    sb zero, 75(sp)
+; RV64I-NEXT:    sb zero, 74(sp)
+; RV64I-NEXT:    sb zero, 73(sp)
+; RV64I-NEXT:    sb zero, 72(sp)
+; RV64I-NEXT:    sb zero, 71(sp)
+; RV64I-NEXT:    sb zero, 70(sp)
+; RV64I-NEXT:    sb zero, 69(sp)
+; RV64I-NEXT:    sb zero, 68(sp)
+; RV64I-NEXT:    sb zero, 67(sp)
+; RV64I-NEXT:    sb zero, 66(sp)
+; RV64I-NEXT:    sb zero, 65(sp)
+; RV64I-NEXT:    sb zero, 64(sp)
+; RV64I-NEXT:    sb zero, 63(sp)
+; RV64I-NEXT:    sb zero, 62(sp)
+; RV64I-NEXT:    sb zero, 61(sp)
+; RV64I-NEXT:    sb zero, 60(sp)
+; RV64I-NEXT:    sb zero, 59(sp)
+; RV64I-NEXT:    sb zero, 58(sp)
+; RV64I-NEXT:    sb zero, 57(sp)
+; RV64I-NEXT:    sb zero, 56(sp)
+; RV64I-NEXT:    sb t0, 113(sp)
+; RV64I-NEXT:    sb ra, 112(sp)
+; RV64I-NEXT:    sb s11, 111(sp)
+; RV64I-NEXT:    sb s10, 110(sp)
+; RV64I-NEXT:    sb s9, 109(sp)
+; RV64I-NEXT:    sb s8, 108(sp)
+; RV64I-NEXT:    sb s7, 107(sp)
+; RV64I-NEXT:    sb s6, 106(sp)
+; RV64I-NEXT:    sb s5, 105(sp)
+; RV64I-NEXT:    sb s4, 104(sp)
+; RV64I-NEXT:    sb s3, 103(sp)
+; RV64I-NEXT:    sb s2, 102(sp)
+; RV64I-NEXT:    sb s1, 101(sp)
+; RV64I-NEXT:    sb s0, 100(sp)
+; RV64I-NEXT:    sb t6, 99(sp)
+; RV64I-NEXT:    sb t5, 98(sp)
+; RV64I-NEXT:    sb t4, 97(sp)
+; RV64I-NEXT:    sb t3, 96(sp)
+; RV64I-NEXT:    sb t2, 95(sp)
+; RV64I-NEXT:    sb t1, 94(sp)
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 93(sp)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 92(sp)
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 91(sp)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 90(sp)
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 89(sp)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 88(sp)
+; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    addi a0, sp, 88
+; RV64I-NEXT:    sub a5, a0, a1
+; RV64I-NEXT:    lbu a0, 8(a5)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(a5)
+; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(a5)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 11(a5)
+; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 12(a5)
+; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a7, 13(a5)
+; RV64I-NEXT:    lbu t0, 14(a5)
+; RV64I-NEXT:    lbu t1, 15(a5)
+; RV64I-NEXT:    lbu t2, 0(a5)
+; RV64I-NEXT:    lbu t3, 1(a5)
+; RV64I-NEXT:    lbu t4, 2(a5)
+; RV64I-NEXT:    lbu t5, 3(a5)
+; RV64I-NEXT:    lbu t6, 4(a5)
+; RV64I-NEXT:    lbu s0, 5(a5)
+; RV64I-NEXT:    lbu s1, 6(a5)
+; RV64I-NEXT:    lbu s2, 7(a5)
+; RV64I-NEXT:    lbu s3, 24(a5)
+; RV64I-NEXT:    lbu s4, 25(a5)
+; RV64I-NEXT:    lbu s5, 26(a5)
+; RV64I-NEXT:    lbu s6, 27(a5)
+; RV64I-NEXT:    lbu s7, 28(a5)
+; RV64I-NEXT:    lbu s8, 29(a5)
+; RV64I-NEXT:    lbu s9, 30(a5)
+; RV64I-NEXT:    lbu s10, 31(a5)
+; RV64I-NEXT:    lbu s11, 16(a5)
+; RV64I-NEXT:    lbu ra, 17(a5)
+; RV64I-NEXT:    lbu a6, 18(a5)
+; RV64I-NEXT:    lbu a4, 19(a5)
+; RV64I-NEXT:    lbu a0, 23(a5)
+; RV64I-NEXT:    lbu a1, 22(a5)
+; RV64I-NEXT:    lbu a3, 21(a5)
+; RV64I-NEXT:    lbu a5, 20(a5)
 ; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    srli a0, a3, 48
-; RV64I-NEXT:    sb a0, 22(a2)
-; RV64I-NEXT:    srli a0, a3, 40
-; RV64I-NEXT:    sb a0, 21(a2)
-; RV64I-NEXT:    srli a0, a3, 32
-; RV64I-NEXT:    sb a0, 20(a2)
-; RV64I-NEXT:    srli a0, a3, 24
-; RV64I-NEXT:    sb a0, 19(a2)
-; RV64I-NEXT:    srli a0, a3, 16
-; RV64I-NEXT:    sb a0, 18(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 17(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    sb a5, 20(a2)
+; RV64I-NEXT:    sb a4, 19(a2)
+; RV64I-NEXT:    sb a6, 18(a2)
+; RV64I-NEXT:    sb ra, 17(a2)
+; RV64I-NEXT:    sb s11, 16(a2)
+; RV64I-NEXT:    sb s10, 31(a2)
+; RV64I-NEXT:    sb s9, 30(a2)
+; RV64I-NEXT:    sb s8, 29(a2)
+; RV64I-NEXT:    sb s7, 28(a2)
+; RV64I-NEXT:    sb s6, 27(a2)
+; RV64I-NEXT:    sb s5, 26(a2)
+; RV64I-NEXT:    sb s4, 25(a2)
+; RV64I-NEXT:    sb s3, 24(a2)
+; RV64I-NEXT:    sb s2, 7(a2)
+; RV64I-NEXT:    sb s1, 6(a2)
+; RV64I-NEXT:    sb s0, 5(a2)
+; RV64I-NEXT:    sb t6, 4(a2)
+; RV64I-NEXT:    sb t5, 3(a2)
+; RV64I-NEXT:    sb t4, 2(a2)
+; RV64I-NEXT:    sb t3, 1(a2)
+; RV64I-NEXT:    sb t2, 0(a2)
+; RV64I-NEXT:    sb t1, 15(a2)
+; RV64I-NEXT:    sb t0, 14(a2)
+; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 224
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a4, 25(a0)
-; RV32I-NEXT:    lbu t2, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    lbu a6, 28(a0)
-; RV32I-NEXT:    lbu t1, 29(a0)
-; RV32I-NEXT:    lbu t3, 30(a0)
-; RV32I-NEXT:    lbu t4, 31(a0)
-; RV32I-NEXT:    lbu t6, 16(a0)
-; RV32I-NEXT:    lbu t0, 17(a0)
-; RV32I-NEXT:    lbu s1, 18(a0)
-; RV32I-NEXT:    lbu s3, 19(a0)
-; RV32I-NEXT:    lbu s0, 20(a0)
-; RV32I-NEXT:    lbu s5, 21(a0)
-; RV32I-NEXT:    lbu s6, 22(a0)
-; RV32I-NEXT:    lbu s7, 23(a0)
-; RV32I-NEXT:    lbu a3, 9(a0)
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu s2, 10(a0)
-; RV32I-NEXT:    lbu s4, 11(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    or a3, s2, a3
-; RV32I-NEXT:    or a3, s4, a3
-; RV32I-NEXT:    lbu a5, 13(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s8, 15(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s2
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    or a5, s4, a5
-; RV32I-NEXT:    or a5, s8, a5
-; RV32I-NEXT:    sw a5, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 1(a0)
-; RV32I-NEXT:    lbu s2, 0(a0)
-; RV32I-NEXT:    lbu s4, 2(a0)
-; RV32I-NEXT:    lbu s8, 3(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s2
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    or a5, s4, a5
-; RV32I-NEXT:    or s4, s8, a5
-; RV32I-NEXT:    lbu a5, 5(a0)
-; RV32I-NEXT:    lbu s2, 4(a0)
-; RV32I-NEXT:    lbu s8, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s2
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a5, s8, a5
-; RV32I-NEXT:    or s8, a0, a5
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu s2, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, s2, a0
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    slli a5, a0, 3
-; RV32I-NEXT:    sll a1, s8, a5
-; RV32I-NEXT:    srli s2, s4, 1
-; RV32I-NEXT:    not a0, a5
-; RV32I-NEXT:    srl s2, s2, a0
-; RV32I-NEXT:    or s10, a1, s2
-; RV32I-NEXT:    addi s2, a5, -224
-; RV32I-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sll a1, s4, a5
-; RV32I-NEXT:    sw s10, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s2, .LBB10_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv s10, a1
-; RV32I-NEXT:  .LBB10_2:
-; RV32I-NEXT:    sw a1, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli s2, t0, 8
-; RV32I-NEXT:    slli s11, s1, 16
-; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli ra, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll a1, a1, a5
-; RV32I-NEXT:    srli t0, a3, 1
-; RV32I-NEXT:    srl s1, t0, a0
-; RV32I-NEXT:    or s5, a1, s1
-; RV32I-NEXT:    addi s9, a5, -160
-; RV32I-NEXT:    sll a1, a3, a5
-; RV32I-NEXT:    sw s5, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s9, .LBB10_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s5, a1
-; RV32I-NEXT:  .LBB10_4:
-; RV32I-NEXT:    sw a1, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, t2, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    or s1, s2, t6
-; RV32I-NEXT:    or s2, s3, s11
-; RV32I-NEXT:    or s0, ra, s0
-; RV32I-NEXT:    or s3, s7, s6
-; RV32I-NEXT:    neg s7, a5
-; RV32I-NEXT:    srl ra, s8, s7
-; RV32I-NEXT:    li s6, 160
-; RV32I-NEXT:    addi t6, a5, -128
-; RV32I-NEXT:    li t2, 64
-; RV32I-NEXT:    sub s6, s6, a5
-; RV32I-NEXT:    sw s6, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu t6, t2, .LBB10_6
-; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    slti s6, s6, 0
-; RV32I-NEXT:    neg s6, s6
-; RV32I-NEXT:    and s6, s6, ra
-; RV32I-NEXT:    or s10, s5, s6
-; RV32I-NEXT:  .LBB10_6:
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    or s5, t5, a1
-; RV32I-NEXT:    or t5, s2, s1
-; RV32I-NEXT:    or s3, s3, s0
-; RV32I-NEXT:    lw s2, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    mv a1, s2
-; RV32I-NEXT:    beqz t6, .LBB10_8
-; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a1, s10
-; RV32I-NEXT:  .LBB10_8:
-; RV32I-NEXT:    or a7, t1, a6
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    or a6, s5, a4
-; RV32I-NEXT:    sll a4, s3, a5
-; RV32I-NEXT:    srli t1, t5, 1
-; RV32I-NEXT:    srl t1, t1, a0
-; RV32I-NEXT:    or t1, a4, t1
-; RV32I-NEXT:    addi t4, a5, -96
-; RV32I-NEXT:    sll a4, t5, a5
-; RV32I-NEXT:    sw a4, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a4, t1
-; RV32I-NEXT:    sw t4, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz t4, .LBB10_10
-; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    lw a4, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB10_10:
-; RV32I-NEXT:    or a7, t3, a7
-; RV32I-NEXT:    addi s4, a5, -32
-; RV32I-NEXT:    sll t3, a6, a5
-; RV32I-NEXT:    sw t3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgez s4, .LBB10_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sll t3, a7, a5
-; RV32I-NEXT:    srli s0, a6, 1
-; RV32I-NEXT:    srl a0, s0, a0
-; RV32I-NEXT:    or t3, t3, a0
-; RV32I-NEXT:  .LBB10_12:
-; RV32I-NEXT:    srl s5, s3, s7
-; RV32I-NEXT:    li a0, 32
-; RV32I-NEXT:    sub s0, a0, a5
-; RV32I-NEXT:    sw s0, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slti s1, s0, 0
-; RV32I-NEXT:    neg s6, s1
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_14
-; RV32I-NEXT:  # %bb.13:
-; RV32I-NEXT:    and a4, s6, s5
-; RV32I-NEXT:    or a4, t3, a4
-; RV32I-NEXT:  .LBB10_14:
-; RV32I-NEXT:    sw s5, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t3, a7
-; RV32I-NEXT:    beqz a5, .LBB10_16
-; RV32I-NEXT:  # %bb.15:
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:  .LBB10_16:
-; RV32I-NEXT:    srl s10, s2, s7
-; RV32I-NEXT:    li a4, 96
-; RV32I-NEXT:    sub s5, a4, a5
-; RV32I-NEXT:    slti a4, s5, 0
-; RV32I-NEXT:    neg a4, a4
-; RV32I-NEXT:    li t2, 128
-; RV32I-NEXT:    sub s11, t2, a5
-; RV32I-NEXT:    sltiu s1, s11, 64
-; RV32I-NEXT:    neg s1, s1
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_18
-; RV32I-NEXT:  # %bb.17:
-; RV32I-NEXT:    and a1, a4, s10
-; RV32I-NEXT:    and a1, s1, a1
-; RV32I-NEXT:    or a1, t3, a1
-; RV32I-NEXT:  .LBB10_18:
-; RV32I-NEXT:    li t2, 64
-; RV32I-NEXT:    sw s1, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    beqz a5, .LBB10_20
-; RV32I-NEXT:  # %bb.19:
-; RV32I-NEXT:    mv a7, a1
-; RV32I-NEXT:  .LBB10_20:
-; RV32I-NEXT:    neg s1, s11
-; RV32I-NEXT:    sub t3, a0, s11
-; RV32I-NEXT:    sll a1, a3, s1
-; RV32I-NEXT:    bltz t3, .LBB10_23
-; RV32I-NEXT:  # %bb.21:
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    bgeu s11, t2, .LBB10_24
-; RV32I-NEXT:  .LBB10_22:
-; RV32I-NEXT:    and a4, a4, ra
-; RV32I-NEXT:    or a4, a4, a0
-; RV32I-NEXT:    mv a0, s8
-; RV32I-NEXT:    bnez s11, .LBB10_25
-; RV32I-NEXT:    j .LBB10_26
-; RV32I-NEXT:  .LBB10_23:
-; RV32I-NEXT:    sll a0, s2, s1
-; RV32I-NEXT:    sub s1, t2, s11
-; RV32I-NEXT:    not s1, s1
-; RV32I-NEXT:    srl t0, t0, s1
-; RV32I-NEXT:    or a0, a0, t0
-; RV32I-NEXT:    bltu s11, t2, .LBB10_22
-; RV32I-NEXT:  .LBB10_24:
-; RV32I-NEXT:    and a4, s6, s10
-; RV32I-NEXT:    mv a0, s8
-; RV32I-NEXT:    beqz s11, .LBB10_26
-; RV32I-NEXT:  .LBB10_25:
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:  .LBB10_26:
-; RV32I-NEXT:    bltz s4, .LBB10_28
-; RV32I-NEXT:  # %bb.27:
-; RV32I-NEXT:    lw t1, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB10_28:
-; RV32I-NEXT:    sw s6, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sltiu t0, a5, 64
-; RV32I-NEXT:    lw a4, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    mv s0, s9
-; RV32I-NEXT:    bltz s9, .LBB10_30
-; RV32I-NEXT:  # %bb.29:
-; RV32I-NEXT:    lw a4, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB10_30:
-; RV32I-NEXT:    neg s9, t0
-; RV32I-NEXT:    sltiu t0, t6, 64
-; RV32I-NEXT:    neg s1, t0
-; RV32I-NEXT:    li t0, 128
-; RV32I-NEXT:    bltu a5, t0, .LBB10_32
-; RV32I-NEXT:  # %bb.31:
-; RV32I-NEXT:    and a4, s1, a4
-; RV32I-NEXT:    mv t0, s3
-; RV32I-NEXT:    bnez a5, .LBB10_33
-; RV32I-NEXT:    j .LBB10_34
-; RV32I-NEXT:  .LBB10_32:
-; RV32I-NEXT:    and a4, s9, t1
-; RV32I-NEXT:    or a4, a4, a0
-; RV32I-NEXT:    mv t0, s3
-; RV32I-NEXT:    beqz a5, .LBB10_34
-; RV32I-NEXT:  .LBB10_33:
-; RV32I-NEXT:    mv t0, a4
-; RV32I-NEXT:  .LBB10_34:
-; RV32I-NEXT:    srl t1, a3, s7
-; RV32I-NEXT:    slli a4, s2, 1
-; RV32I-NEXT:    sub a0, t2, a5
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, -144
+; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu t0, 25(a0)
+; RV32I-NEXT:    lbu a7, 26(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
+; RV32I-NEXT:    lbu a5, 28(a0)
+; RV32I-NEXT:    lbu a3, 31(a0)
+; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a0, 29(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sb a3, 91(sp)
+; RV32I-NEXT:    sb a4, 90(sp)
+; RV32I-NEXT:    sb a0, 89(sp)
+; RV32I-NEXT:    sb a5, 88(sp)
+; RV32I-NEXT:    sb a6, 87(sp)
+; RV32I-NEXT:    sb a7, 86(sp)
+; RV32I-NEXT:    sb zero, 59(sp)
+; RV32I-NEXT:    sb zero, 58(sp)
+; RV32I-NEXT:    sb zero, 57(sp)
+; RV32I-NEXT:    sb zero, 56(sp)
+; RV32I-NEXT:    sb zero, 55(sp)
+; RV32I-NEXT:    sb zero, 54(sp)
+; RV32I-NEXT:    sb zero, 53(sp)
+; RV32I-NEXT:    sb zero, 52(sp)
+; RV32I-NEXT:    sb zero, 51(sp)
+; RV32I-NEXT:    sb zero, 50(sp)
+; RV32I-NEXT:    sb zero, 49(sp)
+; RV32I-NEXT:    sb zero, 48(sp)
+; RV32I-NEXT:    sb zero, 47(sp)
+; RV32I-NEXT:    sb zero, 46(sp)
+; RV32I-NEXT:    sb zero, 45(sp)
+; RV32I-NEXT:    sb zero, 44(sp)
+; RV32I-NEXT:    sb zero, 43(sp)
+; RV32I-NEXT:    sb zero, 42(sp)
+; RV32I-NEXT:    sb zero, 41(sp)
+; RV32I-NEXT:    sb zero, 40(sp)
+; RV32I-NEXT:    sb zero, 39(sp)
+; RV32I-NEXT:    sb zero, 38(sp)
+; RV32I-NEXT:    sb zero, 37(sp)
+; RV32I-NEXT:    sb zero, 36(sp)
+; RV32I-NEXT:    sb zero, 35(sp)
+; RV32I-NEXT:    sb zero, 34(sp)
+; RV32I-NEXT:    sb zero, 33(sp)
+; RV32I-NEXT:    sb zero, 32(sp)
+; RV32I-NEXT:    sb zero, 31(sp)
+; RV32I-NEXT:    sb zero, 30(sp)
+; RV32I-NEXT:    sb zero, 29(sp)
+; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb t0, 85(sp)
+; RV32I-NEXT:    sb ra, 84(sp)
+; RV32I-NEXT:    sb s11, 83(sp)
+; RV32I-NEXT:    sb s10, 82(sp)
+; RV32I-NEXT:    sb s9, 81(sp)
+; RV32I-NEXT:    sb s8, 80(sp)
+; RV32I-NEXT:    sb s7, 79(sp)
+; RV32I-NEXT:    sb s6, 78(sp)
+; RV32I-NEXT:    sb s5, 77(sp)
+; RV32I-NEXT:    sb s4, 76(sp)
+; RV32I-NEXT:    sb s3, 75(sp)
+; RV32I-NEXT:    sb s2, 74(sp)
+; RV32I-NEXT:    sb s1, 73(sp)
+; RV32I-NEXT:    sb s0, 72(sp)
+; RV32I-NEXT:    sb t6, 71(sp)
+; RV32I-NEXT:    sb t5, 70(sp)
+; RV32I-NEXT:    sb t4, 69(sp)
+; RV32I-NEXT:    sb t3, 68(sp)
+; RV32I-NEXT:    sb t2, 67(sp)
+; RV32I-NEXT:    sb t1, 66(sp)
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 65(sp)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 64(sp)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 63(sp)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 62(sp)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 61(sp)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 60(sp)
+; RV32I-NEXT:    andi a1, a1, 31
+; RV32I-NEXT:    addi a0, sp, 60
+; RV32I-NEXT:    sub a5, a0, a1
+; RV32I-NEXT:    lbu a0, 6(a5)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s2, .LBB10_36
-; RV32I-NEXT:  # %bb.35:
-; RV32I-NEXT:    mv s2, s10
-; RV32I-NEXT:    j .LBB10_37
-; RV32I-NEXT:  .LBB10_36:
-; RV32I-NEXT:    sll a0, a4, a0
-; RV32I-NEXT:    or s2, t1, a0
-; RV32I-NEXT:  .LBB10_37:
-; RV32I-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srl s10, a0, s7
-; RV32I-NEXT:    slli s8, s8, 1
-; RV32I-NEXT:    not a0, s11
-; RV32I-NEXT:    bltz s5, .LBB10_39
-; RV32I-NEXT:  # %bb.38:
-; RV32I-NEXT:    mv t4, s8
-; RV32I-NEXT:    mv s6, s10
-; RV32I-NEXT:    mv s8, ra
-; RV32I-NEXT:    bltu s11, t2, .LBB10_40
-; RV32I-NEXT:    j .LBB10_41
-; RV32I-NEXT:  .LBB10_39:
-; RV32I-NEXT:    mv t4, s8
-; RV32I-NEXT:    sll s8, s8, a0
-; RV32I-NEXT:    mv s6, s10
-; RV32I-NEXT:    or s8, s10, s8
-; RV32I-NEXT:    bgeu s11, t2, .LBB10_41
-; RV32I-NEXT:  .LBB10_40:
-; RV32I-NEXT:    slti t3, t3, 0
-; RV32I-NEXT:    neg t3, t3
-; RV32I-NEXT:    and a1, t3, a1
-; RV32I-NEXT:    or s2, s8, a1
-; RV32I-NEXT:  .LBB10_41:
-; RV32I-NEXT:    beqz s11, .LBB10_43
-; RV32I-NEXT:  # %bb.42:
-; RV32I-NEXT:    sw s2, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB10_43:
-; RV32I-NEXT:    mv s11, s4
-; RV32I-NEXT:    slti a1, s4, 0
-; RV32I-NEXT:    neg s8, a1
-; RV32I-NEXT:    slti a1, s0, 0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    li t3, 128
-; RV32I-NEXT:    bltu a5, t3, .LBB10_45
-; RV32I-NEXT:  # %bb.44:
-; RV32I-NEXT:    lw s4, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t3, a1, s4
-; RV32I-NEXT:    and t3, s1, t3
-; RV32I-NEXT:    mv s2, t5
-; RV32I-NEXT:    bnez a5, .LBB10_46
-; RV32I-NEXT:    j .LBB10_47
-; RV32I-NEXT:  .LBB10_45:
-; RV32I-NEXT:    lw t3, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t3, s8, t3
-; RV32I-NEXT:    and t3, s9, t3
-; RV32I-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or t3, t3, s0
-; RV32I-NEXT:    lw s4, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    mv s2, t5
-; RV32I-NEXT:    beqz a5, .LBB10_47
-; RV32I-NEXT:  .LBB10_46:
-; RV32I-NEXT:    mv s2, t3
-; RV32I-NEXT:  .LBB10_47:
-; RV32I-NEXT:    lw t3, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez t3, .LBB10_49
-; RV32I-NEXT:  # %bb.48:
-; RV32I-NEXT:    srl t3, t5, s7
-; RV32I-NEXT:    slli s3, s3, 1
-; RV32I-NEXT:    lw t5, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll t5, s3, t5
-; RV32I-NEXT:    or t3, t3, t5
-; RV32I-NEXT:    sw t3, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB10_49:
-; RV32I-NEXT:    lw s3, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti t3, s3, 0
-; RV32I-NEXT:    neg t5, t3
-; RV32I-NEXT:    bltu a5, t2, .LBB10_51
-; RV32I-NEXT:  # %bb.50:
-; RV32I-NEXT:    lw t3, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and s1, t5, t3
-; RV32I-NEXT:    mv t3, a6
-; RV32I-NEXT:    bnez a5, .LBB10_52
-; RV32I-NEXT:    j .LBB10_53
-; RV32I-NEXT:  .LBB10_51:
-; RV32I-NEXT:    lw t3, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t3, s8, t3
-; RV32I-NEXT:    lw s1, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or s1, t3, s1
-; RV32I-NEXT:    mv t3, a6
-; RV32I-NEXT:    beqz a5, .LBB10_53
-; RV32I-NEXT:  .LBB10_52:
-; RV32I-NEXT:    mv t3, s1
-; RV32I-NEXT:  .LBB10_53:
-; RV32I-NEXT:    bgez s5, .LBB10_55
-; RV32I-NEXT:  # %bb.54:
-; RV32I-NEXT:    sll a0, a4, a0
-; RV32I-NEXT:    or a0, t1, a0
+; RV32I-NEXT:    lbu a0, 7(a5)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB10_55:
-; RV32I-NEXT:    lw a4, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw t1, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw a0, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a0, .LBB10_58
-; RV32I-NEXT:  # %bb.56:
-; RV32I-NEXT:    mv a0, ra
-; RV32I-NEXT:    bgeu t6, t2, .LBB10_59
-; RV32I-NEXT:  .LBB10_57:
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    or a1, a1, a0
-; RV32I-NEXT:    mv a0, a3
-; RV32I-NEXT:    bnez t6, .LBB10_60
-; RV32I-NEXT:    j .LBB10_61
-; RV32I-NEXT:  .LBB10_58:
-; RV32I-NEXT:    li a0, 192
-; RV32I-NEXT:    sub a0, a0, a5
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    sll a0, t4, a0
-; RV32I-NEXT:    or a0, s6, a0
-; RV32I-NEXT:    bltu t6, t2, .LBB10_57
-; RV32I-NEXT:  .LBB10_59:
+; RV32I-NEXT:    lbu a0, 4(a5)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 5(a5)
+; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 0(a5)
+; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 1(a5)
+; RV32I-NEXT:    lbu t0, 2(a5)
+; RV32I-NEXT:    lbu t1, 3(a5)
+; RV32I-NEXT:    lbu t2, 14(a5)
+; RV32I-NEXT:    lbu t3, 15(a5)
+; RV32I-NEXT:    lbu t4, 12(a5)
+; RV32I-NEXT:    lbu t5, 13(a5)
+; RV32I-NEXT:    lbu t6, 10(a5)
+; RV32I-NEXT:    lbu s0, 11(a5)
+; RV32I-NEXT:    lbu s1, 8(a5)
+; RV32I-NEXT:    lbu s2, 9(a5)
+; RV32I-NEXT:    lbu s3, 22(a5)
+; RV32I-NEXT:    lbu s4, 23(a5)
+; RV32I-NEXT:    lbu s5, 20(a5)
+; RV32I-NEXT:    lbu s6, 21(a5)
+; RV32I-NEXT:    lbu s7, 18(a5)
+; RV32I-NEXT:    lbu s8, 19(a5)
+; RV32I-NEXT:    lbu s9, 16(a5)
+; RV32I-NEXT:    lbu s10, 17(a5)
+; RV32I-NEXT:    lbu s11, 30(a5)
+; RV32I-NEXT:    lbu ra, 31(a5)
+; RV32I-NEXT:    lbu a6, 28(a5)
+; RV32I-NEXT:    lbu a4, 29(a5)
+; RV32I-NEXT:    lbu a0, 25(a5)
+; RV32I-NEXT:    lbu a1, 24(a5)
+; RV32I-NEXT:    lbu a3, 27(a5)
+; RV32I-NEXT:    lbu a5, 26(a5)
+; RV32I-NEXT:    sb a0, 25(a2)
+; RV32I-NEXT:    sb a1, 24(a2)
+; RV32I-NEXT:    sb a3, 27(a2)
+; RV32I-NEXT:    sb a5, 26(a2)
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 28(a2)
+; RV32I-NEXT:    sb ra, 31(a2)
+; RV32I-NEXT:    sb s11, 30(a2)
+; RV32I-NEXT:    sb s10, 17(a2)
+; RV32I-NEXT:    sb s9, 16(a2)
+; RV32I-NEXT:    sb s8, 19(a2)
+; RV32I-NEXT:    sb s7, 18(a2)
+; RV32I-NEXT:    sb s6, 21(a2)
+; RV32I-NEXT:    sb s5, 20(a2)
+; RV32I-NEXT:    sb s4, 23(a2)
+; RV32I-NEXT:    sb s3, 22(a2)
+; RV32I-NEXT:    sb s2, 9(a2)
+; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    sb s0, 11(a2)
+; RV32I-NEXT:    sb t6, 10(a2)
+; RV32I-NEXT:    sb t5, 13(a2)
+; RV32I-NEXT:    sb t4, 12(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t1, 3(a2)
+; RV32I-NEXT:    sb t0, 2(a2)
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a0, a0, 0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    and a1, a0, s4
-; RV32I-NEXT:    mv a0, a3
-; RV32I-NEXT:    beqz t6, .LBB10_61
-; RV32I-NEXT:  .LBB10_60:
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:  .LBB10_61:
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB10_70
-; RV32I-NEXT:  # %bb.62:
-; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bnez a5, .LBB10_71
-; RV32I-NEXT:  .LBB10_63:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bgez s3, .LBB10_72
-; RV32I-NEXT:  .LBB10_64:
-; RV32I-NEXT:    bgez s11, .LBB10_73
-; RV32I-NEXT:  .LBB10_65:
-; RV32I-NEXT:    bltu a5, t2, .LBB10_74
-; RV32I-NEXT:  .LBB10_66:
-; RV32I-NEXT:    bnez a5, .LBB10_75
-; RV32I-NEXT:  .LBB10_67:
-; RV32I-NEXT:    lw a0, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a0, .LBB10_76
-; RV32I-NEXT:  .LBB10_68:
-; RV32I-NEXT:    sltiu a0, a5, 128
-; RV32I-NEXT:    bltu a5, t2, .LBB10_77
-; RV32I-NEXT:  .LBB10_69:
-; RV32I-NEXT:    and a1, t5, s4
-; RV32I-NEXT:    neg a4, a0
-; RV32I-NEXT:    bnez a5, .LBB10_78
-; RV32I-NEXT:    j .LBB10_79
-; RV32I-NEXT:  .LBB10_70:
+; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a0, t1, a0
-; RV32I-NEXT:    or a0, t3, a0
-; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    beqz a5, .LBB10_63
-; RV32I-NEXT:  .LBB10_71:
-; RV32I-NEXT:    mv a6, a0
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bltz s3, .LBB10_64
-; RV32I-NEXT:  .LBB10_72:
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    bltz s11, .LBB10_65
-; RV32I-NEXT:  .LBB10_73:
-; RV32I-NEXT:    sw a4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_66
-; RV32I-NEXT:  .LBB10_74:
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a0, a0, ra
-; RV32I-NEXT:    lw a1, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    beqz a5, .LBB10_67
-; RV32I-NEXT:  .LBB10_75:
-; RV32I-NEXT:    sw a0, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a0, .LBB10_68
-; RV32I-NEXT:  .LBB10_76:
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll a0, t4, a0
-; RV32I-NEXT:    or ra, s6, a0
-; RV32I-NEXT:    sltiu a0, a5, 128
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_69
-; RV32I-NEXT:  .LBB10_77:
-; RV32I-NEXT:    and a1, s8, a4
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    neg a4, a0
-; RV32I-NEXT:    beqz a5, .LBB10_79
-; RV32I-NEXT:  .LBB10_78:
-; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:  .LBB10_79:
-; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a4, a1
-; RV32I-NEXT:    and a0, a4, a3
-; RV32I-NEXT:    bltz s11, .LBB10_81
-; RV32I-NEXT:  # %bb.80:
-; RV32I-NEXT:    mv t3, s4
-; RV32I-NEXT:  .LBB10_81:
-; RV32I-NEXT:    and a3, a4, t3
-; RV32I-NEXT:    and a3, a3, s9
-; RV32I-NEXT:    and a5, s8, s4
-; RV32I-NEXT:    and a4, a4, a5
-; RV32I-NEXT:    and a4, a4, s9
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 2(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 6(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    sb a0, 8(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(a2)
-; RV32I-NEXT:    sb a7, 28(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 10(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    srli a0, a7, 24
-; RV32I-NEXT:    sb a0, 31(a2)
-; RV32I-NEXT:    srli a0, a7, 16
-; RV32I-NEXT:    sb a0, 30(a2)
-; RV32I-NEXT:    srli a0, a7, 8
-; RV32I-NEXT:    sb a0, 29(a2)
-; RV32I-NEXT:    sb s2, 16(a2)
-; RV32I-NEXT:    srli a0, a6, 24
-; RV32I-NEXT:    sb a0, 27(a2)
-; RV32I-NEXT:    srli a0, a6, 16
-; RV32I-NEXT:    sb a0, 26(a2)
-; RV32I-NEXT:    srli a0, a6, 8
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    srli a0, s2, 24
-; RV32I-NEXT:    sb a0, 19(a2)
-; RV32I-NEXT:    srli a0, s2, 16
-; RV32I-NEXT:    sb a0, 18(a2)
-; RV32I-NEXT:    srli a0, s2, 8
-; RV32I-NEXT:    sb a0, 17(a2)
-; RV32I-NEXT:    sb t0, 20(a2)
-; RV32I-NEXT:    srli a0, t0, 24
-; RV32I-NEXT:    sb a0, 23(a2)
-; RV32I-NEXT:    srli a0, t0, 16
-; RV32I-NEXT:    sb a0, 22(a2)
-; RV32I-NEXT:    srli a0, t0, 8
-; RV32I-NEXT:    sb a0, 21(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 144
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -3147,895 +2162,455 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
-; RV64I-NEXT:    lbu a5, 10(a0)
-; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or t0, a7, a4
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or t1, a7, a4
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    lbu a4, 25(a0)
-; RV64I-NEXT:    lbu a5, 24(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a7, 27(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a5, 29(a0)
-; RV64I-NEXT:    lbu a6, 28(a0)
-; RV64I-NEXT:    lbu t2, 30(a0)
-; RV64I-NEXT:    lbu t3, 31(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a5, t2, a5
-; RV64I-NEXT:    or a5, t3, a5
-; RV64I-NEXT:    slli a6, a5, 32
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a7, a4, a7
-; RV64I-NEXT:    lbu a4, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu t2, 18(a0)
-; RV64I-NEXT:    lbu t3, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a4, t2, a4
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu t2, 20(a0)
-; RV64I-NEXT:    lbu t4, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, t2
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a6, t4, a6
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or t2, a0, t3
-; RV64I-NEXT:    lbu a0, 5(a1)
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu t3, 7(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a0, a6, a0
-; RV64I-NEXT:    or a0, t3, a0
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu t3, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, t3, a4
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a0, a0, 35
-; RV64I-NEXT:    or a6, a0, a1
-; RV64I-NEXT:    srl a0, t2, a6
-; RV64I-NEXT:    not t5, a6
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll a1, a1, t5
-; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    addi t3, a6, -192
-; RV64I-NEXT:    sra a4, a7, a6
-; RV64I-NEXT:    mv t6, a1
-; RV64I-NEXT:    bltz t3, .LBB11_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv t6, a4
-; RV64I-NEXT:  .LBB11_2:
-; RV64I-NEXT:    or a0, t1, t0
-; RV64I-NEXT:    addi t0, a6, -64
-; RV64I-NEXT:    srl t4, a3, a6
-; RV64I-NEXT:    bltz t0, .LBB11_4
-; RV64I-NEXT:  # %bb.3:
-; RV64I-NEXT:    mv s2, t4
-; RV64I-NEXT:    j .LBB11_5
-; RV64I-NEXT:  .LBB11_4:
-; RV64I-NEXT:    srl t1, a0, a6
-; RV64I-NEXT:    slli s0, a3, 1
-; RV64I-NEXT:    sll t5, s0, t5
-; RV64I-NEXT:    or s2, t1, t5
-; RV64I-NEXT:  .LBB11_5:
-; RV64I-NEXT:    negw s0, a6
-; RV64I-NEXT:    sll t5, t2, s0
-; RV64I-NEXT:    li s1, 64
-; RV64I-NEXT:    li t1, 128
-; RV64I-NEXT:    sub s1, s1, a6
-; RV64I-NEXT:    bltu a6, t1, .LBB11_18
-; RV64I-NEXT:  # %bb.6:
-; RV64I-NEXT:    bnez a6, .LBB11_19
-; RV64I-NEXT:  .LBB11_7:
-; RV64I-NEXT:    bgez s1, .LBB11_9
-; RV64I-NEXT:  .LBB11_8:
-; RV64I-NEXT:    sll a7, a7, s0
-; RV64I-NEXT:    srli t2, t2, 1
-; RV64I-NEXT:    subw t5, t1, a6
-; RV64I-NEXT:    not t5, t5
-; RV64I-NEXT:    srl t2, t2, t5
-; RV64I-NEXT:    or t5, a7, t2
-; RV64I-NEXT:  .LBB11_9:
-; RV64I-NEXT:    sraiw a5, a5, 31
-; RV64I-NEXT:    mv a7, a4
-; RV64I-NEXT:    bgez t3, .LBB11_20
-; RV64I-NEXT:  # %bb.10:
-; RV64I-NEXT:    bltu a6, t1, .LBB11_21
-; RV64I-NEXT:  .LBB11_11:
-; RV64I-NEXT:    bnez a6, .LBB11_22
-; RV64I-NEXT:  .LBB11_12:
-; RV64I-NEXT:    bgez t0, .LBB11_23
-; RV64I-NEXT:  .LBB11_13:
-; RV64I-NEXT:    bgeu a6, t1, .LBB11_24
-; RV64I-NEXT:  .LBB11_14:
-; RV64I-NEXT:    bgez t0, .LBB11_25
-; RV64I-NEXT:  .LBB11_15:
-; RV64I-NEXT:    bltu a6, t1, .LBB11_17
-; RV64I-NEXT:  .LBB11_16:
-; RV64I-NEXT:    mv a4, a5
-; RV64I-NEXT:  .LBB11_17:
-; RV64I-NEXT:    sb a4, 24(a2)
-; RV64I-NEXT:    srli a5, a4, 56
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a4, 48
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a4, 40
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a4, 32
-; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a4, 24
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a4, 16
-; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    srli a4, a4, 8
-; RV64I-NEXT:    sb a4, 25(a2)
-; RV64I-NEXT:    sb a1, 16(a2)
-; RV64I-NEXT:    srli a4, a1, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 17(a2)
-; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    addi sp, sp, -224
+; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv t1, a1
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu t3, 7(a0)
+; RV64I-NEXT:    lbu t4, 8(a0)
+; RV64I-NEXT:    lbu t5, 9(a0)
+; RV64I-NEXT:    lbu t6, 10(a0)
+; RV64I-NEXT:    lbu s0, 11(a0)
+; RV64I-NEXT:    lbu s1, 12(a0)
+; RV64I-NEXT:    lbu s2, 13(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s4, 15(a0)
+; RV64I-NEXT:    lbu s5, 16(a0)
+; RV64I-NEXT:    lbu s6, 17(a0)
+; RV64I-NEXT:    lbu s7, 18(a0)
+; RV64I-NEXT:    lbu s8, 19(a0)
+; RV64I-NEXT:    lbu s9, 20(a0)
+; RV64I-NEXT:    lbu s10, 21(a0)
+; RV64I-NEXT:    lbu s11, 22(a0)
+; RV64I-NEXT:    lbu ra, 23(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a5, 26(a0)
+; RV64I-NEXT:    lbu a4, 27(a0)
+; RV64I-NEXT:    lbu a1, 30(a0)
+; RV64I-NEXT:    lbu a3, 29(a0)
+; RV64I-NEXT:    lbu a0, 28(a0)
+; RV64I-NEXT:    lbu t1, 0(t1)
+; RV64I-NEXT:    sb a1, 86(sp)
+; RV64I-NEXT:    sb a3, 85(sp)
+; RV64I-NEXT:    sb a0, 84(sp)
+; RV64I-NEXT:    sb a4, 83(sp)
+; RV64I-NEXT:    sb a5, 82(sp)
+; RV64I-NEXT:    sb a6, 81(sp)
+; RV64I-NEXT:    sb t0, 87(sp)
+; RV64I-NEXT:    slli t0, t0, 56
+; RV64I-NEXT:    sb a7, 80(sp)
+; RV64I-NEXT:    sb ra, 79(sp)
+; RV64I-NEXT:    sb s11, 78(sp)
+; RV64I-NEXT:    sb s10, 77(sp)
+; RV64I-NEXT:    sb s9, 76(sp)
+; RV64I-NEXT:    sb s8, 75(sp)
+; RV64I-NEXT:    sb s7, 74(sp)
+; RV64I-NEXT:    sb s6, 73(sp)
+; RV64I-NEXT:    sb s5, 72(sp)
+; RV64I-NEXT:    sb s4, 71(sp)
+; RV64I-NEXT:    sb s3, 70(sp)
+; RV64I-NEXT:    sb s2, 69(sp)
+; RV64I-NEXT:    sb s1, 68(sp)
+; RV64I-NEXT:    sb s0, 67(sp)
+; RV64I-NEXT:    sb t6, 66(sp)
+; RV64I-NEXT:    sb t5, 65(sp)
+; RV64I-NEXT:    sb t4, 64(sp)
+; RV64I-NEXT:    sb t3, 63(sp)
+; RV64I-NEXT:    sb t2, 62(sp)
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 61(sp)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 60(sp)
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 59(sp)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 58(sp)
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 57(sp)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 56(sp)
+; RV64I-NEXT:    srai a0, t0, 63
+; RV64I-NEXT:    sb a0, 112(sp)
+; RV64I-NEXT:    sb a0, 104(sp)
+; RV64I-NEXT:    sb a0, 96(sp)
+; RV64I-NEXT:    sb a0, 88(sp)
 ; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 48
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    srli a1, a0, 40
-; RV64I-NEXT:    sb a1, 5(a2)
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    sb a1, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    sb a1, 119(sp)
+; RV64I-NEXT:    srli a3, a0, 48
+; RV64I-NEXT:    sb a3, 118(sp)
+; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    sb a4, 117(sp)
+; RV64I-NEXT:    srli a5, a0, 32
+; RV64I-NEXT:    sb a5, 116(sp)
+; RV64I-NEXT:    srli a6, a0, 24
+; RV64I-NEXT:    sb a6, 115(sp)
+; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    sb a7, 114(sp)
 ; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 1(a2)
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a0, a3, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    srli a0, a3, 48
-; RV64I-NEXT:    sb a0, 14(a2)
-; RV64I-NEXT:    srli a0, a3, 40
-; RV64I-NEXT:    sb a0, 13(a2)
-; RV64I-NEXT:    srli a0, a3, 32
+; RV64I-NEXT:    sb a0, 113(sp)
+; RV64I-NEXT:    sb a1, 111(sp)
+; RV64I-NEXT:    sb a3, 110(sp)
+; RV64I-NEXT:    sb a4, 109(sp)
+; RV64I-NEXT:    sb a5, 108(sp)
+; RV64I-NEXT:    sb a6, 107(sp)
+; RV64I-NEXT:    sb a7, 106(sp)
+; RV64I-NEXT:    sb a0, 105(sp)
+; RV64I-NEXT:    sb a1, 103(sp)
+; RV64I-NEXT:    sb a3, 102(sp)
+; RV64I-NEXT:    sb a4, 101(sp)
+; RV64I-NEXT:    sb a5, 100(sp)
+; RV64I-NEXT:    sb a6, 99(sp)
+; RV64I-NEXT:    sb a7, 98(sp)
+; RV64I-NEXT:    sb a0, 97(sp)
+; RV64I-NEXT:    sb a1, 95(sp)
+; RV64I-NEXT:    sb a3, 94(sp)
+; RV64I-NEXT:    sb a4, 93(sp)
+; RV64I-NEXT:    sb a5, 92(sp)
+; RV64I-NEXT:    sb a6, 91(sp)
+; RV64I-NEXT:    sb a7, 90(sp)
+; RV64I-NEXT:    sb a0, 89(sp)
+; RV64I-NEXT:    andi a0, t1, 31
+; RV64I-NEXT:    addi a1, sp, 56
+; RV64I-NEXT:    add a5, a1, a0
+; RV64I-NEXT:    lbu a0, 8(a5)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(a5)
+; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(a5)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 11(a5)
+; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 12(a5)
+; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a7, 13(a5)
+; RV64I-NEXT:    lbu t0, 14(a5)
+; RV64I-NEXT:    lbu t1, 15(a5)
+; RV64I-NEXT:    lbu t2, 0(a5)
+; RV64I-NEXT:    lbu t3, 1(a5)
+; RV64I-NEXT:    lbu t4, 2(a5)
+; RV64I-NEXT:    lbu t5, 3(a5)
+; RV64I-NEXT:    lbu t6, 4(a5)
+; RV64I-NEXT:    lbu s0, 5(a5)
+; RV64I-NEXT:    lbu s1, 6(a5)
+; RV64I-NEXT:    lbu s2, 7(a5)
+; RV64I-NEXT:    lbu s3, 24(a5)
+; RV64I-NEXT:    lbu s4, 25(a5)
+; RV64I-NEXT:    lbu s5, 26(a5)
+; RV64I-NEXT:    lbu s6, 27(a5)
+; RV64I-NEXT:    lbu s7, 28(a5)
+; RV64I-NEXT:    lbu s8, 29(a5)
+; RV64I-NEXT:    lbu s9, 30(a5)
+; RV64I-NEXT:    lbu s10, 31(a5)
+; RV64I-NEXT:    lbu s11, 16(a5)
+; RV64I-NEXT:    lbu ra, 17(a5)
+; RV64I-NEXT:    lbu a6, 18(a5)
+; RV64I-NEXT:    lbu a4, 19(a5)
+; RV64I-NEXT:    lbu a0, 23(a5)
+; RV64I-NEXT:    lbu a1, 22(a5)
+; RV64I-NEXT:    lbu a3, 21(a5)
+; RV64I-NEXT:    lbu a5, 20(a5)
+; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    sb a5, 20(a2)
+; RV64I-NEXT:    sb a4, 19(a2)
+; RV64I-NEXT:    sb a6, 18(a2)
+; RV64I-NEXT:    sb ra, 17(a2)
+; RV64I-NEXT:    sb s11, 16(a2)
+; RV64I-NEXT:    sb s10, 31(a2)
+; RV64I-NEXT:    sb s9, 30(a2)
+; RV64I-NEXT:    sb s8, 29(a2)
+; RV64I-NEXT:    sb s7, 28(a2)
+; RV64I-NEXT:    sb s6, 27(a2)
+; RV64I-NEXT:    sb s5, 26(a2)
+; RV64I-NEXT:    sb s4, 25(a2)
+; RV64I-NEXT:    sb s3, 24(a2)
+; RV64I-NEXT:    sb s2, 7(a2)
+; RV64I-NEXT:    sb s1, 6(a2)
+; RV64I-NEXT:    sb s0, 5(a2)
+; RV64I-NEXT:    sb t6, 4(a2)
+; RV64I-NEXT:    sb t5, 3(a2)
+; RV64I-NEXT:    sb t4, 2(a2)
+; RV64I-NEXT:    sb t3, 1(a2)
+; RV64I-NEXT:    sb t2, 0(a2)
+; RV64I-NEXT:    sb t1, 15(a2)
+; RV64I-NEXT:    sb t0, 14(a2)
+; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    srli a0, a3, 24
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    srli a0, a3, 16
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 224
 ; RV64I-NEXT:    ret
-; RV64I-NEXT:  .LBB11_18:
-; RV64I-NEXT:    slti t6, s1, 0
-; RV64I-NEXT:    neg t6, t6
-; RV64I-NEXT:    and t6, t6, t5
-; RV64I-NEXT:    or t6, s2, t6
-; RV64I-NEXT:    beqz a6, .LBB11_7
-; RV64I-NEXT:  .LBB11_19:
-; RV64I-NEXT:    mv a0, t6
-; RV64I-NEXT:    bltz s1, .LBB11_8
-; RV64I-NEXT:    j .LBB11_9
-; RV64I-NEXT:  .LBB11_20:
-; RV64I-NEXT:    mv a7, a5
-; RV64I-NEXT:    bgeu a6, t1, .LBB11_11
-; RV64I-NEXT:  .LBB11_21:
-; RV64I-NEXT:    slti a7, t0, 0
-; RV64I-NEXT:    neg a7, a7
-; RV64I-NEXT:    and a7, a7, t4
-; RV64I-NEXT:    or a7, a7, t5
-; RV64I-NEXT:    beqz a6, .LBB11_12
-; RV64I-NEXT:  .LBB11_22:
-; RV64I-NEXT:    mv a3, a7
-; RV64I-NEXT:    bltz t0, .LBB11_13
-; RV64I-NEXT:  .LBB11_23:
-; RV64I-NEXT:    mv a1, a4
-; RV64I-NEXT:    bltu a6, t1, .LBB11_14
-; RV64I-NEXT:  .LBB11_24:
-; RV64I-NEXT:    mv a1, a5
-; RV64I-NEXT:    bltz t0, .LBB11_15
-; RV64I-NEXT:  .LBB11_25:
-; RV64I-NEXT:    mv a4, a5
-; RV64I-NEXT:    bgeu a6, t1, .LBB11_16
-; RV64I-NEXT:    j .LBB11_17
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 4(a0)
-; RV32I-NEXT:    lbu t1, 5(a0)
-; RV32I-NEXT:    lbu t5, 6(a0)
-; RV32I-NEXT:    lbu t6, 7(a0)
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 1(a0)
-; RV32I-NEXT:    lbu t2, 2(a0)
-; RV32I-NEXT:    lbu t3, 3(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    addi sp, sp, -144
+; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t1, a1
+; RV32I-NEXT:    lbu t0, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
 ; RV32I-NEXT:    lbu s3, 14(a0)
 ; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s1, 9(a0)
-; RV32I-NEXT:    lbu s7, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu a3, 21(a0)
-; RV32I-NEXT:    lbu t0, 20(a0)
-; RV32I-NEXT:    lbu t4, 22(a0)
-; RV32I-NEXT:    lbu s5, 23(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t0
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a3, t4, a3
-; RV32I-NEXT:    or a3, s5, a3
-; RV32I-NEXT:    lbu t0, 17(a0)
-; RV32I-NEXT:    lbu t4, 16(a0)
-; RV32I-NEXT:    lbu s5, 18(a0)
-; RV32I-NEXT:    lbu s6, 19(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t4
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or t0, s5, t0
-; RV32I-NEXT:    or t4, s6, t0
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu s5, 28(a0)
-; RV32I-NEXT:    lbu s6, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, s5
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    or t0, s6, t0
-; RV32I-NEXT:    sw s9, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or s10, s9, t0
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu s5, 24(a0)
-; RV32I-NEXT:    lbu s6, 26(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, s5
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t0, s6, t0
-; RV32I-NEXT:    or s9, a0, t0
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu t0, 0(a1)
-; RV32I-NEXT:    lbu s5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, t0
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, s5, a0
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    slli a1, a0, 3
-; RV32I-NEXT:    srl a0, s9, a1
-; RV32I-NEXT:    slli t0, s10, 1
-; RV32I-NEXT:    not s5, a1
-; RV32I-NEXT:    sll t0, t0, s5
-; RV32I-NEXT:    or t0, a0, t0
-; RV32I-NEXT:    addi a0, a1, -224
-; RV32I-NEXT:    sw s10, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sra s6, s10, a1
-; RV32I-NEXT:    mv s10, t0
-; RV32I-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a0, .LBB11_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv s10, s6
-; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    slli ra, a4, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    srl a4, t4, a1
-; RV32I-NEXT:    slli a0, a3, 1
-; RV32I-NEXT:    sll s11, a0, s5
-; RV32I-NEXT:    or s11, a4, s11
-; RV32I-NEXT:    addi a7, a1, -160
-; RV32I-NEXT:    srl a4, a3, a1
-; RV32I-NEXT:    sw a4, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a7, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a7, .LBB11_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB11_4:
-; RV32I-NEXT:    slli a4, t1, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or s0, ra, s0
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    or s8, s8, s7
-; RV32I-NEXT:    neg ra, a1
-; RV32I-NEXT:    sll s7, s9, ra
-; RV32I-NEXT:    li s2, 160
-; RV32I-NEXT:    addi s4, a1, -128
-; RV32I-NEXT:    li t1, 64
-; RV32I-NEXT:    sub a7, s2, a1
-; RV32I-NEXT:    sw s7, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu s4, t1, .LBB11_6
-; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    slti s2, a7, 0
-; RV32I-NEXT:    neg s2, s2
-; RV32I-NEXT:    and s2, s2, s7
-; RV32I-NEXT:    or s10, s11, s2
-; RV32I-NEXT:  .LBB11_6:
-; RV32I-NEXT:    sw a7, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or s2, a4, a6
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    or a6, s3, s0
-; RV32I-NEXT:    or s8, s8, s1
-; RV32I-NEXT:    mv a4, t4
-; RV32I-NEXT:    beqz s4, .LBB11_8
-; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a4, s10
-; RV32I-NEXT:  .LBB11_8:
-; RV32I-NEXT:    lw a7, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    or t3, t3, t2
-; RV32I-NEXT:    or t1, t5, s2
-; RV32I-NEXT:    srl t2, s8, a1
-; RV32I-NEXT:    slli t5, a6, 1
-; RV32I-NEXT:    sll t5, t5, s5
-; RV32I-NEXT:    or t6, t2, t5
-; RV32I-NEXT:    addi s0, a1, -96
-; RV32I-NEXT:    srl t2, a6, a1
-; RV32I-NEXT:    sw t2, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t2, t6
-; RV32I-NEXT:    li a7, 64
-; RV32I-NEXT:    bltz s0, .LBB11_10
-; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    lw t2, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB11_10:
-; RV32I-NEXT:    or s3, t3, a5
-; RV32I-NEXT:    addi t5, a1, -32
-; RV32I-NEXT:    srl a5, t1, a1
-; RV32I-NEXT:    sw s0, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a5, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgez t5, .LBB11_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    srl a5, s3, a1
-; RV32I-NEXT:    slli t3, t1, 1
-; RV32I-NEXT:    sll t3, t3, s5
-; RV32I-NEXT:    or a5, a5, t3
-; RV32I-NEXT:  .LBB11_12:
-; RV32I-NEXT:    sll s10, s8, ra
-; RV32I-NEXT:    li t3, 32
-; RV32I-NEXT:    sub s0, t3, a1
-; RV32I-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slti s0, s0, 0
-; RV32I-NEXT:    neg s0, s0
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_14
-; RV32I-NEXT:  # %bb.13:
-; RV32I-NEXT:    and t2, s0, s10
-; RV32I-NEXT:    or t2, a5, t2
-; RV32I-NEXT:  .LBB11_14:
-; RV32I-NEXT:    sw a6, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, s3
-; RV32I-NEXT:    beqz a1, .LBB11_16
-; RV32I-NEXT:  # %bb.15:
-; RV32I-NEXT:    mv s2, t2
-; RV32I-NEXT:  .LBB11_16:
-; RV32I-NEXT:    sll a6, t4, ra
-; RV32I-NEXT:    li a5, 96
-; RV32I-NEXT:    sub s7, a5, a1
-; RV32I-NEXT:    slti a5, s7, 0
-; RV32I-NEXT:    neg s11, a5
-; RV32I-NEXT:    li t2, 128
-; RV32I-NEXT:    sub s0, t2, a1
-; RV32I-NEXT:    sltiu a5, s0, 64
-; RV32I-NEXT:    neg a5, a5
-; RV32I-NEXT:    bgeu a1, t2, .LBB11_18
-; RV32I-NEXT:  # %bb.17:
-; RV32I-NEXT:    and a4, s11, a6
-; RV32I-NEXT:    and a4, a5, a4
-; RV32I-NEXT:    or a4, s2, a4
-; RV32I-NEXT:  .LBB11_18:
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a5, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    beqz a1, .LBB11_20
-; RV32I-NEXT:  # %bb.19:
-; RV32I-NEXT:    mv s3, a4
-; RV32I-NEXT:  .LBB11_20:
-; RV32I-NEXT:    neg a4, s0
-; RV32I-NEXT:    sub a5, t3, s0
-; RV32I-NEXT:    srl t3, a3, a4
-; RV32I-NEXT:    sw a5, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a5, .LBB11_23
-; RV32I-NEXT:  # %bb.21:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bgeu s0, a7, .LBB11_24
-; RV32I-NEXT:  .LBB11_22:
-; RV32I-NEXT:    and a4, s11, s2
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    mv a4, s9
-; RV32I-NEXT:    bnez s0, .LBB11_25
-; RV32I-NEXT:    j .LBB11_26
-; RV32I-NEXT:  .LBB11_23:
-; RV32I-NEXT:    srl a4, t4, a4
-; RV32I-NEXT:    sub a5, a7, s0
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    sll a0, a0, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    bltu s0, a7, .LBB11_22
-; RV32I-NEXT:  .LBB11_24:
-; RV32I-NEXT:    lw a0, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a0, a0, a6
-; RV32I-NEXT:    mv a4, s9
-; RV32I-NEXT:    beqz s0, .LBB11_26
-; RV32I-NEXT:  .LBB11_25:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:  .LBB11_26:
-; RV32I-NEXT:    sw t3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz t5, .LBB11_28
-; RV32I-NEXT:  # %bb.27:
-; RV32I-NEXT:    lw t6, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB11_28:
-; RV32I-NEXT:    mv t3, t0
-; RV32I-NEXT:    lw a0, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a0, .LBB11_30
-; RV32I-NEXT:  # %bb.29:
-; RV32I-NEXT:    mv t3, s6
-; RV32I-NEXT:  .LBB11_30:
-; RV32I-NEXT:    sltiu a5, a1, 64
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s6, 17(a0)
+; RV32I-NEXT:    lbu s7, 18(a0)
+; RV32I-NEXT:    lbu s8, 19(a0)
+; RV32I-NEXT:    lbu s9, 20(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
+; RV32I-NEXT:    lbu s11, 22(a0)
+; RV32I-NEXT:    lbu ra, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a5, 26(a0)
+; RV32I-NEXT:    lbu a4, 27(a0)
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    lbu a3, 29(a0)
+; RV32I-NEXT:    lbu a0, 28(a0)
+; RV32I-NEXT:    lbu t1, 0(t1)
+; RV32I-NEXT:    sb a1, 58(sp)
+; RV32I-NEXT:    sb a3, 57(sp)
+; RV32I-NEXT:    sb a0, 56(sp)
+; RV32I-NEXT:    sb a4, 55(sp)
+; RV32I-NEXT:    sb a5, 54(sp)
+; RV32I-NEXT:    sb a6, 53(sp)
+; RV32I-NEXT:    sb t0, 59(sp)
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    sb a7, 52(sp)
+; RV32I-NEXT:    sb ra, 51(sp)
+; RV32I-NEXT:    sb s11, 50(sp)
+; RV32I-NEXT:    sb s10, 49(sp)
+; RV32I-NEXT:    sb s9, 48(sp)
+; RV32I-NEXT:    sb s8, 47(sp)
+; RV32I-NEXT:    sb s7, 46(sp)
+; RV32I-NEXT:    sb s6, 45(sp)
+; RV32I-NEXT:    sb s5, 44(sp)
+; RV32I-NEXT:    sb s4, 43(sp)
+; RV32I-NEXT:    sb s3, 42(sp)
+; RV32I-NEXT:    sb s2, 41(sp)
+; RV32I-NEXT:    sb s1, 40(sp)
+; RV32I-NEXT:    sb s0, 39(sp)
+; RV32I-NEXT:    sb t6, 38(sp)
+; RV32I-NEXT:    sb t5, 37(sp)
+; RV32I-NEXT:    sb t4, 36(sp)
+; RV32I-NEXT:    sb t3, 35(sp)
+; RV32I-NEXT:    sb t2, 34(sp)
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 33(sp)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 32(sp)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 31(sp)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 30(sp)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 29(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    bltu s4, a7, .LBB11_32
-; RV32I-NEXT:  # %bb.31:
-; RV32I-NEXT:    mv t3, a0
-; RV32I-NEXT:  .LBB11_32:
-; RV32I-NEXT:    neg s1, a5
-; RV32I-NEXT:    li a5, 128
-; RV32I-NEXT:    bgeu a1, a5, .LBB11_34
-; RV32I-NEXT:  # %bb.33:
-; RV32I-NEXT:    and a5, s1, t6
-; RV32I-NEXT:    or t3, a5, a4
-; RV32I-NEXT:  .LBB11_34:
-; RV32I-NEXT:    mv a4, s8
-; RV32I-NEXT:    beqz a1, .LBB11_36
-; RV32I-NEXT:  # %bb.35:
-; RV32I-NEXT:    mv a4, t3
-; RV32I-NEXT:  .LBB11_36:
-; RV32I-NEXT:    sw a4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sub a4, a7, a1
-; RV32I-NEXT:    not t3, a4
-; RV32I-NEXT:    lw a4, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a4, .LBB11_38
-; RV32I-NEXT:  # %bb.37:
-; RV32I-NEXT:    lw a4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll a4, a4, ra
-; RV32I-NEXT:    srli a5, s8, 1
-; RV32I-NEXT:    srl a5, a5, t3
-; RV32I-NEXT:    or s10, a4, a5
-; RV32I-NEXT:  .LBB11_38:
-; RV32I-NEXT:    slti a4, t5, 0
-; RV32I-NEXT:    neg s5, a4
-; RV32I-NEXT:    li t2, 64
-; RV32I-NEXT:    bltu a1, a7, .LBB11_40
-; RV32I-NEXT:  # %bb.39:
-; RV32I-NEXT:    lw a4, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a4, a4, 0
-; RV32I-NEXT:    neg a4, a4
-; RV32I-NEXT:    lw a5, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, a4, a5
-; RV32I-NEXT:    j .LBB11_41
-; RV32I-NEXT:  .LBB11_40:
-; RV32I-NEXT:    lw a4, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, s5, a4
-; RV32I-NEXT:    or a4, a4, s10
-; RV32I-NEXT:  .LBB11_41:
-; RV32I-NEXT:    sw t0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a7, t1
-; RV32I-NEXT:    beqz a1, .LBB11_43
-; RV32I-NEXT:  # %bb.42:
-; RV32I-NEXT:    mv a7, a4
-; RV32I-NEXT:  .LBB11_43:
-; RV32I-NEXT:    mv s10, t3
-; RV32I-NEXT:    sll a4, a3, ra
-; RV32I-NEXT:    srli s8, t4, 1
-; RV32I-NEXT:    not t3, s0
-; RV32I-NEXT:    mv t0, s7
-; RV32I-NEXT:    bltz s7, .LBB11_45
-; RV32I-NEXT:  # %bb.44:
-; RV32I-NEXT:    mv s7, t5
-; RV32I-NEXT:    mv s11, a6
-; RV32I-NEXT:    j .LBB11_46
-; RV32I-NEXT:  .LBB11_45:
-; RV32I-NEXT:    mv s7, t5
-; RV32I-NEXT:    srl a5, s8, t3
-; RV32I-NEXT:    or s11, a4, a5
-; RV32I-NEXT:  .LBB11_46:
-; RV32I-NEXT:    mv t5, t1
-; RV32I-NEXT:    mv t6, s3
-; RV32I-NEXT:    lw a5, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll s3, a5, ra
-; RV32I-NEXT:    srli s9, s9, 1
-; RV32I-NEXT:    lw a5, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a5, .LBB11_48
-; RV32I-NEXT:  # %bb.47:
-; RV32I-NEXT:    mv t1, s9
-; RV32I-NEXT:    mv s9, s3
-; RV32I-NEXT:    mv s3, s2
-; RV32I-NEXT:    j .LBB11_49
-; RV32I-NEXT:  .LBB11_48:
-; RV32I-NEXT:    li a5, 192
-; RV32I-NEXT:    sub a5, a5, a1
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    mv t1, s9
-; RV32I-NEXT:    srl a5, s9, a5
-; RV32I-NEXT:    mv s9, s3
-; RV32I-NEXT:    or s3, s3, a5
-; RV32I-NEXT:  .LBB11_49:
-; RV32I-NEXT:    mv a5, s6
-; RV32I-NEXT:    lw ra, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz ra, .LBB11_51
-; RV32I-NEXT:  # %bb.50:
-; RV32I-NEXT:    mv a5, a0
-; RV32I-NEXT:  .LBB11_51:
-; RV32I-NEXT:    bltu s4, t2, .LBB11_53
-; RV32I-NEXT:  # %bb.52:
-; RV32I-NEXT:    mv t2, s2
-; RV32I-NEXT:    j .LBB11_54
-; RV32I-NEXT:  .LBB11_53:
-; RV32I-NEXT:    lw a5, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a5, a5, 0
-; RV32I-NEXT:    neg a5, a5
-; RV32I-NEXT:    lw t2, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a5, a5, t2
-; RV32I-NEXT:    lw t2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a5, a5, s3
-; RV32I-NEXT:  .LBB11_54:
-; RV32I-NEXT:    mv s2, s1
-; RV32I-NEXT:    mv ra, s9
-; RV32I-NEXT:    mv s3, a3
-; RV32I-NEXT:    mv s9, t1
-; RV32I-NEXT:    beqz s4, .LBB11_56
-; RV32I-NEXT:  # %bb.55:
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:  .LBB11_56:
-; RV32I-NEXT:    li a5, 128
-; RV32I-NEXT:    mv t1, t5
-; RV32I-NEXT:    bltu a1, a5, .LBB11_61
-; RV32I-NEXT:  # %bb.57:
-; RV32I-NEXT:    li a7, 64
-; RV32I-NEXT:    bnez a1, .LBB11_62
-; RV32I-NEXT:  .LBB11_58:
-; RV32I-NEXT:    lw a5, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a5, .LBB11_63
-; RV32I-NEXT:  .LBB11_59:
-; RV32I-NEXT:    bltz t0, .LBB11_64
-; RV32I-NEXT:  .LBB11_60:
-; RV32I-NEXT:    mv a4, t2
-; RV32I-NEXT:    j .LBB11_65
-; RV32I-NEXT:  .LBB11_61:
-; RV32I-NEXT:    lw a5, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a5, a5, s11
-; RV32I-NEXT:    or s3, a7, a5
-; RV32I-NEXT:    li a7, 64
-; RV32I-NEXT:    beqz a1, .LBB11_58
-; RV32I-NEXT:  .LBB11_62:
-; RV32I-NEXT:    mv t1, s3
-; RV32I-NEXT:    lw a5, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a5, .LBB11_59
-; RV32I-NEXT:  .LBB11_63:
-; RV32I-NEXT:    srl s1, s8, s10
-; RV32I-NEXT:    or a6, a4, s1
-; RV32I-NEXT:    bgez t0, .LBB11_60
-; RV32I-NEXT:  .LBB11_64:
-; RV32I-NEXT:    srl a4, s9, t3
-; RV32I-NEXT:    or a4, ra, a4
-; RV32I-NEXT:  .LBB11_65:
-; RV32I-NEXT:    lw t3, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw t0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgeu s0, a7, .LBB11_67
-; RV32I-NEXT:  # %bb.66:
-; RV32I-NEXT:    lw a5, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a5, a5, 0
-; RV32I-NEXT:    neg a5, a5
-; RV32I-NEXT:    and s1, a5, t3
-; RV32I-NEXT:    or a6, a4, s1
-; RV32I-NEXT:  .LBB11_67:
-; RV32I-NEXT:    beqz s0, .LBB11_69
-; RV32I-NEXT:  # %bb.68:
-; RV32I-NEXT:    sw a6, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB11_69:
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    lw a5, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a5, .LBB11_71
-; RV32I-NEXT:  # %bb.70:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:  .LBB11_71:
-; RV32I-NEXT:    li t3, 128
-; RV32I-NEXT:    lw s0, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw a6, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgeu s4, a7, .LBB11_92
-; RV32I-NEXT:  # %bb.72:
-; RV32I-NEXT:    bltu a1, t3, .LBB11_93
-; RV32I-NEXT:  .LBB11_73:
-; RV32I-NEXT:    bnez a1, .LBB11_94
-; RV32I-NEXT:  .LBB11_74:
-; RV32I-NEXT:    mv a4, t0
-; RV32I-NEXT:    bgez s0, .LBB11_95
-; RV32I-NEXT:  .LBB11_75:
-; RV32I-NEXT:    bgez s7, .LBB11_96
-; RV32I-NEXT:  .LBB11_76:
-; RV32I-NEXT:    bltu a1, a7, .LBB11_97
-; RV32I-NEXT:  .LBB11_77:
-; RV32I-NEXT:    bnez a1, .LBB11_98
-; RV32I-NEXT:  .LBB11_78:
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_99
-; RV32I-NEXT:  .LBB11_79:
-; RV32I-NEXT:    lw a4, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a4, .LBB11_100
-; RV32I-NEXT:  .LBB11_80:
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    bgez s0, .LBB11_101
-; RV32I-NEXT:  .LBB11_81:
-; RV32I-NEXT:    bltu a1, a7, .LBB11_102
-; RV32I-NEXT:  .LBB11_82:
-; RV32I-NEXT:    bnez a1, .LBB11_103
-; RV32I-NEXT:  .LBB11_83:
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_104
-; RV32I-NEXT:  .LBB11_84:
-; RV32I-NEXT:    lw a4, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez s7, .LBB11_105
-; RV32I-NEXT:  .LBB11_85:
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_106
-; RV32I-NEXT:  .LBB11_86:
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_107
-; RV32I-NEXT:  .LBB11_87:
-; RV32I-NEXT:    bgez s7, .LBB11_108
-; RV32I-NEXT:  .LBB11_88:
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_109
-; RV32I-NEXT:  .LBB11_89:
-; RV32I-NEXT:    bltu a1, t3, .LBB11_91
-; RV32I-NEXT:  .LBB11_90:
-; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:  .LBB11_91:
-; RV32I-NEXT:    sb s6, 28(a2)
-; RV32I-NEXT:    srli a0, s6, 24
-; RV32I-NEXT:    sb a0, 31(a2)
-; RV32I-NEXT:    srli a0, s6, 16
-; RV32I-NEXT:    sb a0, 30(a2)
-; RV32I-NEXT:    srli a0, s6, 8
-; RV32I-NEXT:    sb a0, 29(a2)
-; RV32I-NEXT:    sb t0, 24(a2)
-; RV32I-NEXT:    srli a0, t0, 24
-; RV32I-NEXT:    sb a0, 27(a2)
-; RV32I-NEXT:    srli a0, t0, 16
-; RV32I-NEXT:    sb a0, 26(a2)
-; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 28(sp)
+; RV32I-NEXT:    srai a0, t0, 31
+; RV32I-NEXT:    sb a0, 88(sp)
+; RV32I-NEXT:    sb a0, 84(sp)
+; RV32I-NEXT:    sb a0, 80(sp)
+; RV32I-NEXT:    sb a0, 76(sp)
+; RV32I-NEXT:    sb a0, 72(sp)
+; RV32I-NEXT:    sb a0, 68(sp)
+; RV32I-NEXT:    sb a0, 64(sp)
+; RV32I-NEXT:    sb a0, 60(sp)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 91(sp)
+; RV32I-NEXT:    srli a3, a0, 16
+; RV32I-NEXT:    sb a3, 90(sp)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 89(sp)
+; RV32I-NEXT:    sb a1, 87(sp)
+; RV32I-NEXT:    sb a3, 86(sp)
+; RV32I-NEXT:    sb a0, 85(sp)
+; RV32I-NEXT:    sb a1, 83(sp)
+; RV32I-NEXT:    sb a3, 82(sp)
+; RV32I-NEXT:    sb a0, 81(sp)
+; RV32I-NEXT:    sb a1, 79(sp)
+; RV32I-NEXT:    sb a3, 78(sp)
+; RV32I-NEXT:    sb a0, 77(sp)
+; RV32I-NEXT:    sb a1, 75(sp)
+; RV32I-NEXT:    sb a3, 74(sp)
+; RV32I-NEXT:    sb a0, 73(sp)
+; RV32I-NEXT:    sb a1, 71(sp)
+; RV32I-NEXT:    sb a3, 70(sp)
+; RV32I-NEXT:    sb a0, 69(sp)
+; RV32I-NEXT:    sb a1, 67(sp)
+; RV32I-NEXT:    sb a3, 66(sp)
+; RV32I-NEXT:    sb a0, 65(sp)
+; RV32I-NEXT:    sb a1, 63(sp)
+; RV32I-NEXT:    sb a3, 62(sp)
+; RV32I-NEXT:    sb a0, 61(sp)
+; RV32I-NEXT:    andi a0, t1, 31
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a5, a1, a0
+; RV32I-NEXT:    lbu a0, 6(a5)
+; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(a5)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 4(a5)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 5(a5)
+; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 0(a5)
+; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 1(a5)
+; RV32I-NEXT:    lbu t0, 2(a5)
+; RV32I-NEXT:    lbu t1, 3(a5)
+; RV32I-NEXT:    lbu t2, 14(a5)
+; RV32I-NEXT:    lbu t3, 15(a5)
+; RV32I-NEXT:    lbu t4, 12(a5)
+; RV32I-NEXT:    lbu t5, 13(a5)
+; RV32I-NEXT:    lbu t6, 10(a5)
+; RV32I-NEXT:    lbu s0, 11(a5)
+; RV32I-NEXT:    lbu s1, 8(a5)
+; RV32I-NEXT:    lbu s2, 9(a5)
+; RV32I-NEXT:    lbu s3, 22(a5)
+; RV32I-NEXT:    lbu s4, 23(a5)
+; RV32I-NEXT:    lbu s5, 20(a5)
+; RV32I-NEXT:    lbu s6, 21(a5)
+; RV32I-NEXT:    lbu s7, 18(a5)
+; RV32I-NEXT:    lbu s8, 19(a5)
+; RV32I-NEXT:    lbu s9, 16(a5)
+; RV32I-NEXT:    lbu s10, 17(a5)
+; RV32I-NEXT:    lbu s11, 30(a5)
+; RV32I-NEXT:    lbu ra, 31(a5)
+; RV32I-NEXT:    lbu a6, 28(a5)
+; RV32I-NEXT:    lbu a4, 29(a5)
+; RV32I-NEXT:    lbu a0, 25(a5)
+; RV32I-NEXT:    lbu a1, 24(a5)
+; RV32I-NEXT:    lbu a3, 27(a5)
+; RV32I-NEXT:    lbu a5, 26(a5)
 ; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli a0, t4, 24
-; RV32I-NEXT:    sb a0, 19(a2)
-; RV32I-NEXT:    srli a0, t4, 16
-; RV32I-NEXT:    sb a0, 18(a2)
-; RV32I-NEXT:    srli a0, t4, 8
-; RV32I-NEXT:    sb a0, 17(a2)
-; RV32I-NEXT:    sb a3, 20(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 23(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 22(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 21(a2)
-; RV32I-NEXT:    sb t6, 0(a2)
-; RV32I-NEXT:    sb a6, 12(a2)
-; RV32I-NEXT:    srli a0, t6, 24
-; RV32I-NEXT:    sb a0, 3(a2)
-; RV32I-NEXT:    srli a0, t6, 16
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    srli a0, t6, 8
-; RV32I-NEXT:    sb a0, 1(a2)
-; RV32I-NEXT:    sb t1, 4(a2)
-; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    srli a0, a6, 24
-; RV32I-NEXT:    sb a0, 15(a2)
-; RV32I-NEXT:    srli a0, a6, 16
-; RV32I-NEXT:    sb a0, 14(a2)
-; RV32I-NEXT:    srli a0, a6, 8
-; RV32I-NEXT:    sb a0, 13(a2)
-; RV32I-NEXT:    srli a0, t1, 24
+; RV32I-NEXT:    sb a1, 24(a2)
+; RV32I-NEXT:    sb a3, 27(a2)
+; RV32I-NEXT:    sb a5, 26(a2)
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 28(a2)
+; RV32I-NEXT:    sb ra, 31(a2)
+; RV32I-NEXT:    sb s11, 30(a2)
+; RV32I-NEXT:    sb s10, 17(a2)
+; RV32I-NEXT:    sb s9, 16(a2)
+; RV32I-NEXT:    sb s8, 19(a2)
+; RV32I-NEXT:    sb s7, 18(a2)
+; RV32I-NEXT:    sb s6, 21(a2)
+; RV32I-NEXT:    sb s5, 20(a2)
+; RV32I-NEXT:    sb s4, 23(a2)
+; RV32I-NEXT:    sb s3, 22(a2)
+; RV32I-NEXT:    sb s2, 9(a2)
+; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    sb s0, 11(a2)
+; RV32I-NEXT:    sb t6, 10(a2)
+; RV32I-NEXT:    sb t5, 13(a2)
+; RV32I-NEXT:    sb t4, 12(a2)
+; RV32I-NEXT:    sb t3, 15(a2)
+; RV32I-NEXT:    sb t2, 14(a2)
+; RV32I-NEXT:    sb t1, 3(a2)
+; RV32I-NEXT:    sb t0, 2(a2)
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    srli a0, t1, 16
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a0, t1, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    srli a0, a4, 8
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 144
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB11_92:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_73
-; RV32I-NEXT:  .LBB11_93:
-; RV32I-NEXT:    lw a4, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, s5, a4
-; RV32I-NEXT:    and a4, s2, a4
-; RV32I-NEXT:    lw a5, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    beqz a1, .LBB11_74
-; RV32I-NEXT:  .LBB11_94:
-; RV32I-NEXT:    mv a6, a4
-; RV32I-NEXT:    mv a4, t0
-; RV32I-NEXT:    bltz s0, .LBB11_75
-; RV32I-NEXT:  .LBB11_95:
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    bltz s7, .LBB11_76
-; RV32I-NEXT:  .LBB11_96:
-; RV32I-NEXT:    lw a5, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_77
-; RV32I-NEXT:  .LBB11_97:
-; RV32I-NEXT:    lw a4, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, a4, t2
-; RV32I-NEXT:    lw a5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    beqz a1, .LBB11_78
-; RV32I-NEXT:  .LBB11_98:
-; RV32I-NEXT:    mv t4, a4
-; RV32I-NEXT:    bltu a1, t3, .LBB11_79
-; RV32I-NEXT:  .LBB11_99:
-; RV32I-NEXT:    mv t4, a0
-; RV32I-NEXT:    lw a4, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a4, .LBB11_80
-; RV32I-NEXT:  .LBB11_100:
-; RV32I-NEXT:    srl a4, s9, s10
-; RV32I-NEXT:    or t2, ra, a4
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    bltz s0, .LBB11_81
-; RV32I-NEXT:  .LBB11_101:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_82
-; RV32I-NEXT:  .LBB11_102:
-; RV32I-NEXT:    lw a4, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, s5, a4
-; RV32I-NEXT:    or a4, a4, t2
-; RV32I-NEXT:    beqz a1, .LBB11_83
-; RV32I-NEXT:  .LBB11_103:
-; RV32I-NEXT:    mv a3, a4
-; RV32I-NEXT:    bltu a1, t3, .LBB11_84
-; RV32I-NEXT:  .LBB11_104:
-; RV32I-NEXT:    mv a3, a0
-; RV32I-NEXT:    lw a4, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz s7, .LBB11_85
-; RV32I-NEXT:  .LBB11_105:
-; RV32I-NEXT:    mv t0, s6
-; RV32I-NEXT:    bltu a1, a7, .LBB11_86
-; RV32I-NEXT:  .LBB11_106:
-; RV32I-NEXT:    mv t0, a0
-; RV32I-NEXT:    bltu a1, t3, .LBB11_87
-; RV32I-NEXT:  .LBB11_107:
-; RV32I-NEXT:    mv t0, a0
-; RV32I-NEXT:    bltz s7, .LBB11_88
-; RV32I-NEXT:  .LBB11_108:
-; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:    bltu a1, a7, .LBB11_89
-; RV32I-NEXT:  .LBB11_109:
-; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_90
-; RV32I-NEXT:    j .LBB11_91
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3

diff  --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 055f66c1d45e0..c26c13d61afe7 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -707,167 +707,164 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 7(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a7, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or t2, t0, a6
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t0, 15(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a6, t0, a4
-; RV32I-NEXT:    lbu a4, 9(a0)
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu a0, 11(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t0, a4
-; RV32I-NEXT:    or t0, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 1(a1)
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    or s0, s0, s1
+; RV32I-NEXT:    lbu s1, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    sb zero, 43(sp)
+; RV32I-NEXT:    sb zero, 42(sp)
+; RV32I-NEXT:    sb zero, 41(sp)
+; RV32I-NEXT:    sb zero, 40(sp)
+; RV32I-NEXT:    sb zero, 39(sp)
+; RV32I-NEXT:    sb zero, 38(sp)
+; RV32I-NEXT:    sb zero, 37(sp)
+; RV32I-NEXT:    sb zero, 36(sp)
+; RV32I-NEXT:    sb zero, 35(sp)
+; RV32I-NEXT:    sb zero, 34(sp)
+; RV32I-NEXT:    sb zero, 33(sp)
+; RV32I-NEXT:    sb zero, 32(sp)
+; RV32I-NEXT:    sb zero, 31(sp)
+; RV32I-NEXT:    sb zero, 30(sp)
+; RV32I-NEXT:    sb zero, 29(sp)
+; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb a0, 27(sp)
+; RV32I-NEXT:    sb s4, 26(sp)
+; RV32I-NEXT:    sb s3, 25(sp)
+; RV32I-NEXT:    sb s2, 24(sp)
+; RV32I-NEXT:    sb t6, 23(sp)
+; RV32I-NEXT:    sb t5, 22(sp)
+; RV32I-NEXT:    sb t4, 21(sp)
+; RV32I-NEXT:    sb t3, 20(sp)
+; RV32I-NEXT:    sb t2, 19(sp)
+; RV32I-NEXT:    sb t1, 18(sp)
+; RV32I-NEXT:    sb t0, 17(sp)
+; RV32I-NEXT:    sb a7, 16(sp)
+; RV32I-NEXT:    sb a6, 15(sp)
+; RV32I-NEXT:    sb a5, 14(sp)
+; RV32I-NEXT:    sb a4, 13(sp)
+; RV32I-NEXT:    sb a3, 12(sp)
+; RV32I-NEXT:    slli a0, a1, 25
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    addi a3, sp, 12
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    lbu a0, 5(a3)
+; RV32I-NEXT:    lbu a4, 4(a3)
+; RV32I-NEXT:    lbu a5, 6(a3)
+; RV32I-NEXT:    lbu a6, 7(a3)
 ; RV32I-NEXT:    slli a0, a0, 8
 ; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a1, a1, a0
-; RV32I-NEXT:    srl a0, t0, a1
-; RV32I-NEXT:    not t4, a1
-; RV32I-NEXT:    slli a4, a6, 1
-; RV32I-NEXT:    sll a4, a4, t4
-; RV32I-NEXT:    or a4, a0, a4
-; RV32I-NEXT:    addi t1, a1, -96
-; RV32I-NEXT:    srl a5, a6, a1
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:    bltz t1, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t3, a5
-; RV32I-NEXT:  .LBB6_2:
-; RV32I-NEXT:    or a0, t2, a7
-; RV32I-NEXT:    addi a7, a1, -32
-; RV32I-NEXT:    srl t2, a3, a1
-; RV32I-NEXT:    bltz a7, .LBB6_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s1, t2
-; RV32I-NEXT:    j .LBB6_5
-; RV32I-NEXT:  .LBB6_4:
-; RV32I-NEXT:    srl t5, a0, a1
-; RV32I-NEXT:    slli t6, a3, 1
-; RV32I-NEXT:    sll t4, t6, t4
-; RV32I-NEXT:    or s1, t5, t4
-; RV32I-NEXT:  .LBB6_5:
-; RV32I-NEXT:    neg t6, a1
-; RV32I-NEXT:    sll t4, t0, t6
-; RV32I-NEXT:    li s0, 32
-; RV32I-NEXT:    li t5, 64
-; RV32I-NEXT:    sub s0, s0, a1
-; RV32I-NEXT:    bltu a1, t5, .LBB6_11
-; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    bnez a1, .LBB6_12
-; RV32I-NEXT:  .LBB6_7:
-; RV32I-NEXT:    bgez s0, .LBB6_9
-; RV32I-NEXT:  .LBB6_8:
-; RV32I-NEXT:    sll a6, a6, t6
-; RV32I-NEXT:    srli t0, t0, 1
-; RV32I-NEXT:    sub t3, t5, a1
-; RV32I-NEXT:    not t3, t3
-; RV32I-NEXT:    srl t0, t0, t3
-; RV32I-NEXT:    or t4, a6, t0
-; RV32I-NEXT:  .LBB6_9:
-; RV32I-NEXT:    slti a6, a7, 0
-; RV32I-NEXT:    neg a6, a6
-; RV32I-NEXT:    bltu a1, t5, .LBB6_13
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    slti t0, t1, 0
-; RV32I-NEXT:    neg t0, t0
-; RV32I-NEXT:    and t0, t0, a5
-; RV32I-NEXT:    bnez a1, .LBB6_14
-; RV32I-NEXT:    j .LBB6_15
-; RV32I-NEXT:  .LBB6_11:
-; RV32I-NEXT:    slti t3, s0, 0
-; RV32I-NEXT:    neg t3, t3
-; RV32I-NEXT:    and t3, t3, t4
-; RV32I-NEXT:    or t3, s1, t3
-; RV32I-NEXT:    beqz a1, .LBB6_7
-; RV32I-NEXT:  .LBB6_12:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bltz s0, .LBB6_8
-; RV32I-NEXT:    j .LBB6_9
-; RV32I-NEXT:  .LBB6_13:
-; RV32I-NEXT:    and t0, a6, t2
-; RV32I-NEXT:    or t0, t0, t4
-; RV32I-NEXT:    beqz a1, .LBB6_15
-; RV32I-NEXT:  .LBB6_14:
-; RV32I-NEXT:    mv a3, t0
-; RV32I-NEXT:  .LBB6_15:
-; RV32I-NEXT:    bltz a7, .LBB6_17
-; RV32I-NEXT:  # %bb.16:
-; RV32I-NEXT:    mv a4, a5
-; RV32I-NEXT:  .LBB6_17:
-; RV32I-NEXT:    sltiu a1, a1, 64
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    and a4, a1, a4
-; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    or a4, a6, a0
+; RV32I-NEXT:    andi a5, a1, 7
+; RV32I-NEXT:    srl a0, a4, a5
+; RV32I-NEXT:    lbu a1, 9(a3)
+; RV32I-NEXT:    lbu a6, 8(a3)
+; RV32I-NEXT:    lbu a7, 10(a3)
+; RV32I-NEXT:    lbu t0, 11(a3)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a6, t0, a1
+; RV32I-NEXT:    slli a1, a6, 1
+; RV32I-NEXT:    not a7, a5
+; RV32I-NEXT:    sll a1, a1, a7
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    lbu a7, 1(a3)
+; RV32I-NEXT:    lbu t0, 0(a3)
+; RV32I-NEXT:    lbu t1, 2(a3)
+; RV32I-NEXT:    lbu t2, 3(a3)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    or a7, t2, a7
+; RV32I-NEXT:    srl a7, a7, a5
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    xori t0, a5, 31
+; RV32I-NEXT:    sll a4, a4, t0
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    srl a6, a6, a5
+; RV32I-NEXT:    lbu t1, 13(a3)
+; RV32I-NEXT:    lbu t2, 12(a3)
+; RV32I-NEXT:    lbu t3, 14(a3)
+; RV32I-NEXT:    lbu a3, 15(a3)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    or a3, a3, t1
+; RV32I-NEXT:    slli t1, a3, 1
+; RV32I-NEXT:    sll t0, t1, t0
+; RV32I-NEXT:    or t0, a6, t0
+; RV32I-NEXT:    srl a3, a3, a5
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a6, 16
 ; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(a2)
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 1(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a5, a6, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a5, a3, 16
+; RV32I-NEXT:    sb a5, 14(a2)
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    sb a5, 15(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a7, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a7, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a3, a0, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -994,167 +991,164 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 9(a0)
-; RV32I-NEXT:    lbu a4, 8(a0)
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu a6, 11(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t0, 15(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a7, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or t2, t0, a6
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
 ; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a6, t0, a4
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu t0, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t0, a4
-; RV32I-NEXT:    or t0, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 1(a1)
+; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    or s0, s0, s1
+; RV32I-NEXT:    lbu s1, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    sb zero, 27(sp)
+; RV32I-NEXT:    sb zero, 26(sp)
+; RV32I-NEXT:    sb zero, 25(sp)
+; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    sb zero, 23(sp)
+; RV32I-NEXT:    sb zero, 22(sp)
+; RV32I-NEXT:    sb zero, 21(sp)
+; RV32I-NEXT:    sb zero, 20(sp)
+; RV32I-NEXT:    sb zero, 19(sp)
+; RV32I-NEXT:    sb zero, 18(sp)
+; RV32I-NEXT:    sb zero, 17(sp)
+; RV32I-NEXT:    sb zero, 16(sp)
+; RV32I-NEXT:    sb zero, 15(sp)
+; RV32I-NEXT:    sb zero, 14(sp)
+; RV32I-NEXT:    sb zero, 13(sp)
+; RV32I-NEXT:    sb zero, 12(sp)
+; RV32I-NEXT:    sb a0, 43(sp)
+; RV32I-NEXT:    sb s4, 42(sp)
+; RV32I-NEXT:    sb s3, 41(sp)
+; RV32I-NEXT:    sb s2, 40(sp)
+; RV32I-NEXT:    sb t6, 39(sp)
+; RV32I-NEXT:    sb t5, 38(sp)
+; RV32I-NEXT:    sb t4, 37(sp)
+; RV32I-NEXT:    sb t3, 36(sp)
+; RV32I-NEXT:    sb t2, 35(sp)
+; RV32I-NEXT:    sb t1, 34(sp)
+; RV32I-NEXT:    sb t0, 33(sp)
+; RV32I-NEXT:    sb a7, 32(sp)
+; RV32I-NEXT:    sb a6, 31(sp)
+; RV32I-NEXT:    sb a5, 30(sp)
+; RV32I-NEXT:    sb a4, 29(sp)
+; RV32I-NEXT:    sb a3, 28(sp)
+; RV32I-NEXT:    slli a0, a1, 25
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    addi a3, sp, 28
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    lbu a0, 5(a3)
+; RV32I-NEXT:    lbu a4, 4(a3)
+; RV32I-NEXT:    lbu a5, 6(a3)
+; RV32I-NEXT:    lbu a6, 7(a3)
 ; RV32I-NEXT:    slli a0, a0, 8
 ; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a1, a1, a0
-; RV32I-NEXT:    sll a0, t0, a1
-; RV32I-NEXT:    not t4, a1
-; RV32I-NEXT:    srli a4, a6, 1
-; RV32I-NEXT:    srl a4, a4, t4
-; RV32I-NEXT:    or a4, a0, a4
-; RV32I-NEXT:    addi t1, a1, -96
-; RV32I-NEXT:    sll a5, a6, a1
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:    bltz t1, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t3, a5
-; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    or a0, t2, a7
-; RV32I-NEXT:    addi a7, a1, -32
-; RV32I-NEXT:    sll t2, a3, a1
-; RV32I-NEXT:    bltz a7, .LBB7_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s1, t2
-; RV32I-NEXT:    j .LBB7_5
-; RV32I-NEXT:  .LBB7_4:
-; RV32I-NEXT:    sll t5, a0, a1
-; RV32I-NEXT:    srli t6, a3, 1
-; RV32I-NEXT:    srl t4, t6, t4
-; RV32I-NEXT:    or s1, t5, t4
-; RV32I-NEXT:  .LBB7_5:
-; RV32I-NEXT:    neg t6, a1
-; RV32I-NEXT:    srl t4, t0, t6
-; RV32I-NEXT:    li s0, 32
-; RV32I-NEXT:    li t5, 64
-; RV32I-NEXT:    sub s0, s0, a1
-; RV32I-NEXT:    bltu a1, t5, .LBB7_11
-; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    bnez a1, .LBB7_12
-; RV32I-NEXT:  .LBB7_7:
-; RV32I-NEXT:    bgez s0, .LBB7_9
-; RV32I-NEXT:  .LBB7_8:
-; RV32I-NEXT:    srl a6, a6, t6
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    sub t3, t5, a1
-; RV32I-NEXT:    not t3, t3
-; RV32I-NEXT:    sll t0, t0, t3
-; RV32I-NEXT:    or t4, a6, t0
-; RV32I-NEXT:  .LBB7_9:
-; RV32I-NEXT:    slti a6, a7, 0
-; RV32I-NEXT:    neg a6, a6
-; RV32I-NEXT:    bltu a1, t5, .LBB7_13
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    slti t0, t1, 0
-; RV32I-NEXT:    neg t0, t0
-; RV32I-NEXT:    and t0, t0, a5
-; RV32I-NEXT:    bnez a1, .LBB7_14
-; RV32I-NEXT:    j .LBB7_15
-; RV32I-NEXT:  .LBB7_11:
-; RV32I-NEXT:    slti t3, s0, 0
-; RV32I-NEXT:    neg t3, t3
-; RV32I-NEXT:    and t3, t3, t4
-; RV32I-NEXT:    or t3, s1, t3
-; RV32I-NEXT:    beqz a1, .LBB7_7
-; RV32I-NEXT:  .LBB7_12:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bltz s0, .LBB7_8
-; RV32I-NEXT:    j .LBB7_9
-; RV32I-NEXT:  .LBB7_13:
-; RV32I-NEXT:    and t0, a6, t2
-; RV32I-NEXT:    or t0, t0, t4
-; RV32I-NEXT:    beqz a1, .LBB7_15
-; RV32I-NEXT:  .LBB7_14:
-; RV32I-NEXT:    mv a3, t0
-; RV32I-NEXT:  .LBB7_15:
-; RV32I-NEXT:    bltz a7, .LBB7_17
-; RV32I-NEXT:  # %bb.16:
-; RV32I-NEXT:    mv a4, a5
-; RV32I-NEXT:  .LBB7_17:
-; RV32I-NEXT:    sltiu a1, a1, 64
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    and a4, a1, a4
-; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
-; RV32I-NEXT:    srli a5, a1, 16
-; RV32I-NEXT:    sb a5, 2(a2)
-; RV32I-NEXT:    srli a5, a1, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 13(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    or a4, a6, a0
+; RV32I-NEXT:    andi a5, a1, 7
+; RV32I-NEXT:    sll a0, a4, a5
+; RV32I-NEXT:    lbu a1, 1(a3)
+; RV32I-NEXT:    lbu a6, 0(a3)
+; RV32I-NEXT:    lbu a7, 2(a3)
+; RV32I-NEXT:    lbu t0, 3(a3)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a6, t0, a1
+; RV32I-NEXT:    srli a1, a6, 1
+; RV32I-NEXT:    xori a7, a5, 31
+; RV32I-NEXT:    srl a1, a1, a7
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    lbu t0, 13(a3)
+; RV32I-NEXT:    lbu t1, 12(a3)
+; RV32I-NEXT:    lbu t2, 14(a3)
+; RV32I-NEXT:    lbu t3, 15(a3)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t0, t2, t0
+; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    sll t0, t0, a5
+; RV32I-NEXT:    lbu t1, 9(a3)
+; RV32I-NEXT:    lbu t2, 8(a3)
+; RV32I-NEXT:    lbu t3, 10(a3)
+; RV32I-NEXT:    lbu a3, 11(a3)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    or a3, a3, t1
+; RV32I-NEXT:    srli t1, a3, 1
+; RV32I-NEXT:    srl a7, t1, a7
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    sll a3, a3, a5
+; RV32I-NEXT:    srli a4, a4, 1
+; RV32I-NEXT:    not t1, a5
+; RV32I-NEXT:    srl a4, a4, t1
+; RV32I-NEXT:    or a4, a3, a4
+; RV32I-NEXT:    sll a5, a6, a5
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 10(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 11(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    srli a3, t0, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a3, t0, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, t0, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    srli a3, a0, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb a7, 12(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1280,185 +1274,172 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 7(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 15(a0)
+; RV32I-NEXT:    slli a4, a3, 24
 ; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a6, a4, a5
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or t0, t0, a7
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a7, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a5, a7, 16
-; RV32I-NEXT:    slli a7, t1, 24
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    or t1, a7, a4
-; RV32I-NEXT:    lbu a4, 9(a0)
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu t2, 10(a0)
-; RV32I-NEXT:    lbu a0, 11(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or t2, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 1(a1)
+; RV32I-NEXT:    lbu s2, 0(a1)
+; RV32I-NEXT:    lbu s3, 11(a0)
+; RV32I-NEXT:    lbu s4, 12(a0)
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s1, s1, s2
+; RV32I-NEXT:    lbu s2, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    lbu s5, 13(a0)
+; RV32I-NEXT:    lbu a0, 14(a0)
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s1
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a0, 22(sp)
+; RV32I-NEXT:    sb s5, 21(sp)
+; RV32I-NEXT:    sb s4, 20(sp)
+; RV32I-NEXT:    sb s3, 19(sp)
+; RV32I-NEXT:    sb s0, 18(sp)
+; RV32I-NEXT:    sb t6, 17(sp)
+; RV32I-NEXT:    sb t5, 16(sp)
+; RV32I-NEXT:    sb t4, 15(sp)
+; RV32I-NEXT:    sb t3, 14(sp)
+; RV32I-NEXT:    sb t2, 13(sp)
+; RV32I-NEXT:    sb t1, 12(sp)
+; RV32I-NEXT:    sb t0, 11(sp)
+; RV32I-NEXT:    sb a7, 10(sp)
+; RV32I-NEXT:    sb a6, 9(sp)
+; RV32I-NEXT:    sb a5, 8(sp)
+; RV32I-NEXT:    srai a4, a4, 31
+; RV32I-NEXT:    sb a4, 36(sp)
+; RV32I-NEXT:    sb a4, 32(sp)
+; RV32I-NEXT:    sb a4, 28(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 39(sp)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 38(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 37(sp)
+; RV32I-NEXT:    sb a0, 35(sp)
+; RV32I-NEXT:    sb a3, 34(sp)
+; RV32I-NEXT:    sb a4, 33(sp)
+; RV32I-NEXT:    sb a0, 31(sp)
+; RV32I-NEXT:    sb a3, 30(sp)
+; RV32I-NEXT:    sb a4, 29(sp)
+; RV32I-NEXT:    sb a0, 27(sp)
+; RV32I-NEXT:    sb a3, 26(sp)
+; RV32I-NEXT:    sb a4, 25(sp)
+; RV32I-NEXT:    slli a0, a1, 25
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    addi a3, sp, 8
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    lbu a0, 5(a3)
+; RV32I-NEXT:    lbu a4, 4(a3)
+; RV32I-NEXT:    lbu a5, 6(a3)
+; RV32I-NEXT:    lbu a6, 7(a3)
 ; RV32I-NEXT:    slli a0, a0, 8
 ; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a5, a1, a0
-; RV32I-NEXT:    srl a0, t2, a5
-; RV32I-NEXT:    not t5, a5
-; RV32I-NEXT:    slli a1, t1, 1
-; RV32I-NEXT:    sll a1, a1, t5
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    addi t3, a5, -96
-; RV32I-NEXT:    sra a4, t1, a5
-; RV32I-NEXT:    mv t6, a0
-; RV32I-NEXT:    bltz t3, .LBB8_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t6, a4
-; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    or a1, t0, a6
-; RV32I-NEXT:    addi a6, a5, -32
-; RV32I-NEXT:    srl t4, a3, a5
-; RV32I-NEXT:    bltz a6, .LBB8_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s2, t4
-; RV32I-NEXT:    j .LBB8_5
-; RV32I-NEXT:  .LBB8_4:
-; RV32I-NEXT:    srl t0, a1, a5
-; RV32I-NEXT:    slli s0, a3, 1
-; RV32I-NEXT:    sll t5, s0, t5
-; RV32I-NEXT:    or s2, t0, t5
-; RV32I-NEXT:  .LBB8_5:
-; RV32I-NEXT:    neg s0, a5
-; RV32I-NEXT:    sll t5, t2, s0
-; RV32I-NEXT:    li s1, 32
-; RV32I-NEXT:    li t0, 64
-; RV32I-NEXT:    sub s1, s1, a5
-; RV32I-NEXT:    bltu a5, t0, .LBB8_18
-; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    bnez a5, .LBB8_19
-; RV32I-NEXT:  .LBB8_7:
-; RV32I-NEXT:    bgez s1, .LBB8_9
-; RV32I-NEXT:  .LBB8_8:
-; RV32I-NEXT:    sll t1, t1, s0
-; RV32I-NEXT:    srli t2, t2, 1
-; RV32I-NEXT:    sub t5, t0, a5
-; RV32I-NEXT:    not t5, t5
-; RV32I-NEXT:    srl t2, t2, t5
-; RV32I-NEXT:    or t5, t1, t2
-; RV32I-NEXT:  .LBB8_9:
-; RV32I-NEXT:    srai a7, a7, 31
-; RV32I-NEXT:    mv t1, a4
-; RV32I-NEXT:    bgez t3, .LBB8_20
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    bltu a5, t0, .LBB8_21
-; RV32I-NEXT:  .LBB8_11:
-; RV32I-NEXT:    bnez a5, .LBB8_22
-; RV32I-NEXT:  .LBB8_12:
-; RV32I-NEXT:    bgez a6, .LBB8_23
-; RV32I-NEXT:  .LBB8_13:
-; RV32I-NEXT:    bgeu a5, t0, .LBB8_24
-; RV32I-NEXT:  .LBB8_14:
-; RV32I-NEXT:    bgez a6, .LBB8_25
-; RV32I-NEXT:  .LBB8_15:
-; RV32I-NEXT:    bltu a5, t0, .LBB8_17
-; RV32I-NEXT:  .LBB8_16:
-; RV32I-NEXT:    mv a4, a7
-; RV32I-NEXT:  .LBB8_17:
-; RV32I-NEXT:    sb a4, 12(a2)
-; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    or a4, a6, a0
+; RV32I-NEXT:    andi a5, a1, 7
+; RV32I-NEXT:    srl a0, a4, a5
+; RV32I-NEXT:    lbu a1, 9(a3)
+; RV32I-NEXT:    lbu a6, 8(a3)
+; RV32I-NEXT:    lbu a7, 10(a3)
+; RV32I-NEXT:    lbu t0, 11(a3)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a6, t0, a1
+; RV32I-NEXT:    slli a1, a6, 1
+; RV32I-NEXT:    not a7, a5
+; RV32I-NEXT:    sll a1, a1, a7
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    lbu a7, 1(a3)
+; RV32I-NEXT:    lbu t0, 0(a3)
+; RV32I-NEXT:    lbu t1, 2(a3)
+; RV32I-NEXT:    lbu t2, 3(a3)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    or a7, t2, a7
+; RV32I-NEXT:    srl a7, a7, a5
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    xori t0, a5, 31
+; RV32I-NEXT:    sll a4, a4, t0
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    srl a6, a6, a5
+; RV32I-NEXT:    lbu t1, 13(a3)
+; RV32I-NEXT:    lbu t2, 12(a3)
+; RV32I-NEXT:    lbu t3, 14(a3)
+; RV32I-NEXT:    lbu a3, 15(a3)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    or a3, a3, t1
+; RV32I-NEXT:    slli t1, a3, 1
+; RV32I-NEXT:    sll t0, t1, t0
+; RV32I-NEXT:    or t0, a6, t0
+; RV32I-NEXT:    sra a3, a3, a5
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a6, 16
+; RV32I-NEXT:    sb a5, 10(a2)
+; RV32I-NEXT:    srli a5, a6, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a5, a3, 16
 ; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    srli a5, a4, 24
+; RV32I-NEXT:    srli a5, a3, 24
 ; RV32I-NEXT:    sb a5, 15(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 13(a2)
-; RV32I-NEXT:    sb a0, 8(a2)
-; RV32I-NEXT:    srli a4, a0, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    srli a0, a1, 16
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    srli a0, a1, 24
-; RV32I-NEXT:    sb a0, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a7, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a7, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a3, a0, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB8_18:
-; RV32I-NEXT:    slti t6, s1, 0
-; RV32I-NEXT:    neg t6, t6
-; RV32I-NEXT:    and t6, t6, t5
-; RV32I-NEXT:    or t6, s2, t6
-; RV32I-NEXT:    beqz a5, .LBB8_7
-; RV32I-NEXT:  .LBB8_19:
-; RV32I-NEXT:    mv a1, t6
-; RV32I-NEXT:    bltz s1, .LBB8_8
-; RV32I-NEXT:    j .LBB8_9
-; RV32I-NEXT:  .LBB8_20:
-; RV32I-NEXT:    mv t1, a7
-; RV32I-NEXT:    bgeu a5, t0, .LBB8_11
-; RV32I-NEXT:  .LBB8_21:
-; RV32I-NEXT:    slti t1, a6, 0
-; RV32I-NEXT:    neg t1, t1
-; RV32I-NEXT:    and t1, t1, t4
-; RV32I-NEXT:    or t1, t1, t5
-; RV32I-NEXT:    beqz a5, .LBB8_12
-; RV32I-NEXT:  .LBB8_22:
-; RV32I-NEXT:    mv a3, t1
-; RV32I-NEXT:    bltz a6, .LBB8_13
-; RV32I-NEXT:  .LBB8_23:
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    bltu a5, t0, .LBB8_14
-; RV32I-NEXT:  .LBB8_24:
-; RV32I-NEXT:    mv a0, a7
-; RV32I-NEXT:    bltz a6, .LBB8_15
-; RV32I-NEXT:  .LBB8_25:
-; RV32I-NEXT:    mv a4, a7
-; RV32I-NEXT:    bgeu a5, t0, .LBB8_16
-; RV32I-NEXT:    j .LBB8_17
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
   %res = ashr i128 %src, %bitOff
@@ -1469,822 +1450,654 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
-; RV64I-NEXT:    lbu a5, 10(a0)
-; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    addi sp, sp, -224
+; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 3(a0)
+; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 4(a0)
+; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    lbu s9, 1(a1)
+; RV64I-NEXT:    lbu s10, 0(a1)
+; RV64I-NEXT:    lbu s11, 2(a1)
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s9, s9, s10
+; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    lbu s10, 5(a1)
+; RV64I-NEXT:    lbu ra, 4(a1)
+; RV64I-NEXT:    or s9, s11, s9
+; RV64I-NEXT:    lbu s11, 6(a1)
+; RV64I-NEXT:    slli s10, s10, 8
+; RV64I-NEXT:    or s10, s10, ra
+; RV64I-NEXT:    lbu ra, 7(a1)
+; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    or s10, s11, s10
+; RV64I-NEXT:    lbu s11, 21(a0)
+; RV64I-NEXT:    slli ra, ra, 24
+; RV64I-NEXT:    or s10, ra, s10
+; RV64I-NEXT:    lbu ra, 22(a0)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli s10, s10, 32
+; RV64I-NEXT:    or s9, s10, s9
+; RV64I-NEXT:    lbu s10, 23(a0)
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or t0, s9, a1
+; RV64I-NEXT:    lbu s9, 24(a0)
+; RV64I-NEXT:    lbu a7, 25(a0)
+; RV64I-NEXT:    lbu a6, 26(a0)
+; RV64I-NEXT:    lbu a5, 27(a0)
+; RV64I-NEXT:    lbu a1, 31(a0)
+; RV64I-NEXT:    lbu a3, 30(a0)
+; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu a0, 28(a0)
+; RV64I-NEXT:    sb a1, 87(sp)
+; RV64I-NEXT:    sb a3, 86(sp)
+; RV64I-NEXT:    sb a4, 85(sp)
+; RV64I-NEXT:    sb a0, 84(sp)
+; RV64I-NEXT:    sb a5, 83(sp)
+; RV64I-NEXT:    sb a6, 82(sp)
+; RV64I-NEXT:    sb a7, 81(sp)
+; RV64I-NEXT:    sb s9, 80(sp)
+; RV64I-NEXT:    sb s10, 79(sp)
+; RV64I-NEXT:    sb ra, 78(sp)
+; RV64I-NEXT:    sb s11, 77(sp)
+; RV64I-NEXT:    sb s8, 76(sp)
+; RV64I-NEXT:    sb s7, 75(sp)
+; RV64I-NEXT:    sb s6, 74(sp)
+; RV64I-NEXT:    sb s5, 73(sp)
+; RV64I-NEXT:    sb s4, 72(sp)
+; RV64I-NEXT:    sb s3, 71(sp)
+; RV64I-NEXT:    sb s2, 70(sp)
+; RV64I-NEXT:    sb s1, 69(sp)
+; RV64I-NEXT:    sb s0, 68(sp)
+; RV64I-NEXT:    sb t6, 67(sp)
+; RV64I-NEXT:    sb t5, 66(sp)
+; RV64I-NEXT:    sb zero, 119(sp)
+; RV64I-NEXT:    sb zero, 118(sp)
+; RV64I-NEXT:    sb zero, 117(sp)
+; RV64I-NEXT:    sb zero, 116(sp)
+; RV64I-NEXT:    sb zero, 115(sp)
+; RV64I-NEXT:    sb zero, 114(sp)
+; RV64I-NEXT:    sb zero, 113(sp)
+; RV64I-NEXT:    sb zero, 112(sp)
+; RV64I-NEXT:    sb zero, 111(sp)
+; RV64I-NEXT:    sb zero, 110(sp)
+; RV64I-NEXT:    sb zero, 109(sp)
+; RV64I-NEXT:    sb zero, 108(sp)
+; RV64I-NEXT:    sb zero, 107(sp)
+; RV64I-NEXT:    sb zero, 106(sp)
+; RV64I-NEXT:    sb zero, 105(sp)
+; RV64I-NEXT:    sb zero, 104(sp)
+; RV64I-NEXT:    sb zero, 103(sp)
+; RV64I-NEXT:    sb zero, 102(sp)
+; RV64I-NEXT:    sb zero, 101(sp)
+; RV64I-NEXT:    sb zero, 100(sp)
+; RV64I-NEXT:    sb zero, 99(sp)
+; RV64I-NEXT:    sb zero, 98(sp)
+; RV64I-NEXT:    sb zero, 97(sp)
+; RV64I-NEXT:    sb zero, 96(sp)
+; RV64I-NEXT:    sb zero, 95(sp)
+; RV64I-NEXT:    sb zero, 94(sp)
+; RV64I-NEXT:    sb zero, 93(sp)
+; RV64I-NEXT:    sb zero, 92(sp)
+; RV64I-NEXT:    sb zero, 91(sp)
+; RV64I-NEXT:    sb zero, 90(sp)
+; RV64I-NEXT:    sb zero, 89(sp)
+; RV64I-NEXT:    sb zero, 88(sp)
+; RV64I-NEXT:    sb t4, 65(sp)
+; RV64I-NEXT:    sb t3, 64(sp)
+; RV64I-NEXT:    sb t2, 63(sp)
+; RV64I-NEXT:    sb t1, 62(sp)
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 61(sp)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 60(sp)
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 59(sp)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 58(sp)
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 57(sp)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 56(sp)
+; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    srli a0, a0, 59
+; RV64I-NEXT:    addi a3, sp, 56
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    lbu a0, 9(a3)
+; RV64I-NEXT:    lbu a1, 8(a3)
+; RV64I-NEXT:    lbu a4, 10(a3)
+; RV64I-NEXT:    lbu a5, 11(a3)
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    lbu a1, 13(a3)
+; RV64I-NEXT:    lbu a4, 12(a3)
+; RV64I-NEXT:    lbu a6, 14(a3)
+; RV64I-NEXT:    lbu a7, 15(a3)
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a5, a7, a4
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or t0, t0, a4
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    lbu a4, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    or a1, a6, a1
+; RV64I-NEXT:    or a1, a7, a1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    or a4, a0, a5
+; RV64I-NEXT:    andi a1, t0, 7
+; RV64I-NEXT:    lbu a0, 17(a3)
+; RV64I-NEXT:    lbu a5, 16(a3)
+; RV64I-NEXT:    lbu a6, 18(a3)
+; RV64I-NEXT:    lbu a7, 19(a3)
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    lbu a5, 21(a3)
+; RV64I-NEXT:    lbu a6, 20(a3)
+; RV64I-NEXT:    lbu t0, 22(a3)
+; RV64I-NEXT:    lbu t1, 23(a3)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t2, 30(a0)
-; RV64I-NEXT:    lbu t3, 31(a0)
+; RV64I-NEXT:    or a5, t0, a5
+; RV64I-NEXT:    or a5, t1, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    or a5, a0, a7
+; RV64I-NEXT:    slli a0, a5, 1
+; RV64I-NEXT:    not a6, a1
+; RV64I-NEXT:    sll a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a3)
+; RV64I-NEXT:    lbu a7, 0(a3)
+; RV64I-NEXT:    lbu t0, 2(a3)
+; RV64I-NEXT:    lbu t1, 3(a3)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    lbu a7, 5(a3)
+; RV64I-NEXT:    lbu t0, 4(a3)
+; RV64I-NEXT:    lbu t2, 6(a3)
+; RV64I-NEXT:    lbu t3, 7(a3)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a6, t2, a6
-; RV64I-NEXT:    or a6, t3, a6
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a4, a4, t1
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t1, 18(a0)
-; RV64I-NEXT:    lbu t2, 19(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    lbu t1, 20(a0)
-; RV64I-NEXT:    lbu t3, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t1
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a7, t2, a7
 ; RV64I-NEXT:    or a7, t3, a7
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or t1, a0, t2
-; RV64I-NEXT:    lbu a0, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu t2, 3(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    lbu t3, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a6, t3, a6
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a1, a0, t2
-; RV64I-NEXT:    srl a0, t1, a1
-; RV64I-NEXT:    not t4, a1
-; RV64I-NEXT:    slli a6, a4, 1
-; RV64I-NEXT:    sll a6, a6, t4
-; RV64I-NEXT:    or a6, a0, a6
-; RV64I-NEXT:    addi t2, a1, -192
-; RV64I-NEXT:    srl a7, a4, a1
-; RV64I-NEXT:    mv t3, a6
-; RV64I-NEXT:    bltz t2, .LBB9_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv t3, a7
-; RV64I-NEXT:  .LBB9_2:
-; RV64I-NEXT:    or a0, t0, a5
-; RV64I-NEXT:    addi a5, a1, -64
-; RV64I-NEXT:    srl t0, a3, a1
-; RV64I-NEXT:    bltz a5, .LBB9_4
-; RV64I-NEXT:  # %bb.3:
-; RV64I-NEXT:    mv s1, t0
-; RV64I-NEXT:    j .LBB9_5
-; RV64I-NEXT:  .LBB9_4:
-; RV64I-NEXT:    srl t5, a0, a1
-; RV64I-NEXT:    slli t6, a3, 1
-; RV64I-NEXT:    sll t4, t6, t4
-; RV64I-NEXT:    or s1, t5, t4
-; RV64I-NEXT:  .LBB9_5:
-; RV64I-NEXT:    negw t6, a1
-; RV64I-NEXT:    sll t4, t1, t6
-; RV64I-NEXT:    li s0, 64
-; RV64I-NEXT:    li t5, 128
-; RV64I-NEXT:    sub s0, s0, a1
-; RV64I-NEXT:    bltu a1, t5, .LBB9_11
-; RV64I-NEXT:  # %bb.6:
-; RV64I-NEXT:    bnez a1, .LBB9_12
-; RV64I-NEXT:  .LBB9_7:
-; RV64I-NEXT:    bgez s0, .LBB9_9
-; RV64I-NEXT:  .LBB9_8:
-; RV64I-NEXT:    sll a4, a4, t6
-; RV64I-NEXT:    srli t1, t1, 1
-; RV64I-NEXT:    subw t3, t5, a1
-; RV64I-NEXT:    not t3, t3
-; RV64I-NEXT:    srl t1, t1, t3
-; RV64I-NEXT:    or t4, a4, t1
-; RV64I-NEXT:  .LBB9_9:
-; RV64I-NEXT:    slti a4, a5, 0
-; RV64I-NEXT:    neg a4, a4
-; RV64I-NEXT:    bltu a1, t5, .LBB9_13
-; RV64I-NEXT:  # %bb.10:
-; RV64I-NEXT:    slti t0, t2, 0
-; RV64I-NEXT:    neg t0, t0
-; RV64I-NEXT:    and t0, t0, a7
-; RV64I-NEXT:    bnez a1, .LBB9_14
-; RV64I-NEXT:    j .LBB9_15
-; RV64I-NEXT:  .LBB9_11:
-; RV64I-NEXT:    slti t3, s0, 0
-; RV64I-NEXT:    neg t3, t3
-; RV64I-NEXT:    and t3, t3, t4
-; RV64I-NEXT:    or t3, s1, t3
-; RV64I-NEXT:    beqz a1, .LBB9_7
-; RV64I-NEXT:  .LBB9_12:
-; RV64I-NEXT:    mv a0, t3
-; RV64I-NEXT:    bltz s0, .LBB9_8
-; RV64I-NEXT:    j .LBB9_9
-; RV64I-NEXT:  .LBB9_13:
-; RV64I-NEXT:    and t0, a4, t0
-; RV64I-NEXT:    or t0, t0, t4
-; RV64I-NEXT:    beqz a1, .LBB9_15
-; RV64I-NEXT:  .LBB9_14:
-; RV64I-NEXT:    mv a3, t0
-; RV64I-NEXT:  .LBB9_15:
-; RV64I-NEXT:    bltz a5, .LBB9_17
-; RV64I-NEXT:  # %bb.16:
-; RV64I-NEXT:    mv a6, a7
-; RV64I-NEXT:  .LBB9_17:
-; RV64I-NEXT:    sltiu a1, a1, 128
-; RV64I-NEXT:    neg a1, a1
-; RV64I-NEXT:    and a5, a1, a6
-; RV64I-NEXT:    and a4, a4, a7
-; RV64I-NEXT:    and a1, a1, a4
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 25(a3)
+; RV64I-NEXT:    lbu t0, 24(a3)
+; RV64I-NEXT:    lbu t2, 26(a3)
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    lbu t0, 29(a3)
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    lbu t1, 28(a3)
+; RV64I-NEXT:    lbu t2, 30(a3)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    lbu t3, 31(a3)
+; RV64I-NEXT:    or t0, t0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or t0, t2, t0
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    slli t1, a4, 1
+; RV64I-NEXT:    lbu a3, 27(a3)
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    xori t0, a1, 63
+; RV64I-NEXT:    sll t1, t1, t0
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a7, a3
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a7, a7, t0
+; RV64I-NEXT:    srl a4, a4, a1
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    srl a1, a3, a1
+; RV64I-NEXT:    srli a3, a5, 48
+; RV64I-NEXT:    sb a3, 22(a2)
+; RV64I-NEXT:    srli a3, a5, 40
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    srli a3, a5, 32
+; RV64I-NEXT:    sb a3, 20(a2)
+; RV64I-NEXT:    srli a3, a5, 24
+; RV64I-NEXT:    sb a3, 19(a2)
+; RV64I-NEXT:    srli a3, a5, 16
+; RV64I-NEXT:    sb a3, 18(a2)
+; RV64I-NEXT:    or a3, a5, a7
 ; RV64I-NEXT:    sb a5, 16(a2)
-; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    srli a4, a5, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a5, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a5, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a5, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a5, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a5, 16
-; RV64I-NEXT:    sb a4, 18(a2)
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a4, a1, 56
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 30(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 29(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 28(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 27(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 26(a2)
+; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    sb a5, 31(a2)
+; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    sb a5, 30(a2)
+; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    sb a5, 29(a2)
+; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    sb a5, 28(a2)
+; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 25(a2)
-; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    srli a1, a6, 48
 ; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    srli a1, a6, 40
 ; RV64I-NEXT:    sb a1, 5(a2)
-; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    srli a1, a6, 32
 ; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    srli a1, a6, 24
 ; RV64I-NEXT:    sb a1, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    srli a1, a6, 16
 ; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 1(a2)
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a0, a3, 56
+; RV64I-NEXT:    or a1, a6, t1
+; RV64I-NEXT:    sb a6, 0(a2)
+; RV64I-NEXT:    srli a5, a6, 8
+; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    srli a5, a4, 48
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    srli a5, a4, 40
+; RV64I-NEXT:    sb a5, 13(a2)
+; RV64I-NEXT:    srli a5, a4, 32
+; RV64I-NEXT:    sb a5, 12(a2)
+; RV64I-NEXT:    srli a5, a4, 24
+; RV64I-NEXT:    sb a5, 11(a2)
+; RV64I-NEXT:    srli a5, a4, 16
+; RV64I-NEXT:    sb a5, 10(a2)
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    sb a4, 8(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 9(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    srli a1, a1, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    srli a0, a3, 48
-; RV64I-NEXT:    sb a0, 14(a2)
-; RV64I-NEXT:    srli a0, a3, 40
-; RV64I-NEXT:    sb a0, 13(a2)
-; RV64I-NEXT:    srli a0, a3, 32
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    srli a0, a3, 24
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    srli a0, a3, 16
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 224
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -112
-; RV32I-NEXT:    sw ra, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 4(a0)
-; RV32I-NEXT:    lbu t4, 5(a0)
-; RV32I-NEXT:    lbu t6, 6(a0)
-; RV32I-NEXT:    lbu s0, 7(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 1(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t3, 3(a0)
-; RV32I-NEXT:    lbu s1, 12(a0)
-; RV32I-NEXT:    lbu t5, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s7, 9(a0)
-; RV32I-NEXT:    lbu s8, 10(a0)
-; RV32I-NEXT:    lbu s9, 11(a0)
-; RV32I-NEXT:    lbu a3, 21(a0)
-; RV32I-NEXT:    lbu a4, 20(a0)
-; RV32I-NEXT:    lbu a5, 22(a0)
-; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, t2, a3
-; RV32I-NEXT:    lbu a4, 17(a0)
-; RV32I-NEXT:    lbu a5, 16(a0)
-; RV32I-NEXT:    lbu t2, 18(a0)
-; RV32I-NEXT:    lbu s4, 19(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or s10, s4, a4
-; RV32I-NEXT:    lbu a4, 29(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu t2, 30(a0)
-; RV32I-NEXT:    lbu s4, 31(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or s11, s4, a4
-; RV32I-NEXT:    lbu a4, 25(a0)
-; RV32I-NEXT:    lbu a5, 24(a0)
-; RV32I-NEXT:    lbu t2, 26(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a4, t2, a4
-; RV32I-NEXT:    or s6, a0, a4
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    addi sp, sp, -144
+; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu s8, 1(a1)
+; RV32I-NEXT:    lbu s9, 20(a0)
+; RV32I-NEXT:    lbu s10, 0(a1)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    lbu ra, 2(a1)
+; RV32I-NEXT:    or s8, s8, s10
+; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    or s8, ra, s8
+; RV32I-NEXT:    lbu ra, 23(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t0, a1, s8
+; RV32I-NEXT:    lbu s8, 24(a0)
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu a6, 26(a0)
+; RV32I-NEXT:    lbu a5, 27(a0)
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    lbu a3, 30(a0)
+; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu a0, 28(a0)
+; RV32I-NEXT:    sb a1, 59(sp)
+; RV32I-NEXT:    sb a3, 58(sp)
+; RV32I-NEXT:    sb a4, 57(sp)
+; RV32I-NEXT:    sb a0, 56(sp)
+; RV32I-NEXT:    sb a5, 55(sp)
+; RV32I-NEXT:    sb a6, 54(sp)
+; RV32I-NEXT:    sb a7, 53(sp)
+; RV32I-NEXT:    sb s8, 52(sp)
+; RV32I-NEXT:    sb ra, 51(sp)
+; RV32I-NEXT:    sb s10, 50(sp)
+; RV32I-NEXT:    sb s11, 49(sp)
+; RV32I-NEXT:    sb s9, 48(sp)
+; RV32I-NEXT:    sb s7, 47(sp)
+; RV32I-NEXT:    sb s6, 46(sp)
+; RV32I-NEXT:    sb s5, 45(sp)
+; RV32I-NEXT:    sb s4, 44(sp)
+; RV32I-NEXT:    sb zero, 91(sp)
+; RV32I-NEXT:    sb zero, 90(sp)
+; RV32I-NEXT:    sb zero, 89(sp)
+; RV32I-NEXT:    sb zero, 88(sp)
+; RV32I-NEXT:    sb zero, 87(sp)
+; RV32I-NEXT:    sb zero, 86(sp)
+; RV32I-NEXT:    sb zero, 85(sp)
+; RV32I-NEXT:    sb zero, 84(sp)
+; RV32I-NEXT:    sb zero, 83(sp)
+; RV32I-NEXT:    sb zero, 82(sp)
+; RV32I-NEXT:    sb zero, 81(sp)
+; RV32I-NEXT:    sb zero, 80(sp)
+; RV32I-NEXT:    sb zero, 79(sp)
+; RV32I-NEXT:    sb zero, 78(sp)
+; RV32I-NEXT:    sb zero, 77(sp)
+; RV32I-NEXT:    sb zero, 76(sp)
+; RV32I-NEXT:    sb zero, 75(sp)
+; RV32I-NEXT:    sb zero, 74(sp)
+; RV32I-NEXT:    sb zero, 73(sp)
+; RV32I-NEXT:    sb zero, 72(sp)
+; RV32I-NEXT:    sb zero, 71(sp)
+; RV32I-NEXT:    sb zero, 70(sp)
+; RV32I-NEXT:    sb zero, 69(sp)
+; RV32I-NEXT:    sb zero, 68(sp)
+; RV32I-NEXT:    sb zero, 67(sp)
+; RV32I-NEXT:    sb zero, 66(sp)
+; RV32I-NEXT:    sb zero, 65(sp)
+; RV32I-NEXT:    sb zero, 64(sp)
+; RV32I-NEXT:    sb zero, 63(sp)
+; RV32I-NEXT:    sb zero, 62(sp)
+; RV32I-NEXT:    sb zero, 61(sp)
+; RV32I-NEXT:    sb zero, 60(sp)
+; RV32I-NEXT:    sb s3, 43(sp)
+; RV32I-NEXT:    sb s2, 42(sp)
+; RV32I-NEXT:    sb s1, 41(sp)
+; RV32I-NEXT:    sb s0, 40(sp)
+; RV32I-NEXT:    sb t6, 39(sp)
+; RV32I-NEXT:    sb t5, 38(sp)
+; RV32I-NEXT:    sb t4, 37(sp)
+; RV32I-NEXT:    sb t3, 36(sp)
+; RV32I-NEXT:    sb t2, 35(sp)
+; RV32I-NEXT:    sb t1, 34(sp)
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 33(sp)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 32(sp)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 31(sp)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 30(sp)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 29(sp)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 28(sp)
+; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    addi a4, sp, 28
+; RV32I-NEXT:    add a4, a4, a0
+; RV32I-NEXT:    lbu a0, 5(a4)
+; RV32I-NEXT:    lbu a1, 4(a4)
+; RV32I-NEXT:    lbu a3, 6(a4)
+; RV32I-NEXT:    lbu a5, 7(a4)
 ; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    or t4, a5, a0
+; RV32I-NEXT:    andi a3, t0, 7
+; RV32I-NEXT:    lbu a0, 9(a4)
+; RV32I-NEXT:    lbu a1, 8(a4)
+; RV32I-NEXT:    lbu a5, 10(a4)
+; RV32I-NEXT:    lbu a6, 11(a4)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a5, a1, a0
-; RV32I-NEXT:    srl a0, s6, a5
-; RV32I-NEXT:    not s4, a5
-; RV32I-NEXT:    slli a1, s11, 1
-; RV32I-NEXT:    sll a1, a1, s4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    addi a4, a5, -224
-; RV32I-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    srl a1, s11, a5
-; RV32I-NEXT:    sw a0, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a4, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a4, .LBB9_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli t2, t5, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s11, s5, 24
-; RV32I-NEXT:    slli ra, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    srl a1, s10, a5
-; RV32I-NEXT:    slli t5, a3, 1
-; RV32I-NEXT:    sll a4, t5, s4
-; RV32I-NEXT:    or s5, a1, a4
-; RV32I-NEXT:    addi s7, a5, -160
-; RV32I-NEXT:    srl a1, a3, a5
-; RV32I-NEXT:    sw a1, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s7, .LBB9_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    lw s5, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_4:
-; RV32I-NEXT:    slli a4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t2, t2, s1
-; RV32I-NEXT:    or s1, s11, s3
-; RV32I-NEXT:    or s2, ra, s2
-; RV32I-NEXT:    or s3, s9, s8
-; RV32I-NEXT:    neg s11, a5
-; RV32I-NEXT:    sll t4, s6, s11
-; RV32I-NEXT:    li a1, 160
-; RV32I-NEXT:    addi s8, a5, -128
-; RV32I-NEXT:    li s9, 64
-; RV32I-NEXT:    sub a1, a1, a5
-; RV32I-NEXT:    bltu s8, s9, .LBB9_6
-; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv ra, t4
-; RV32I-NEXT:    j .LBB9_7
-; RV32I-NEXT:  .LBB9_6:
-; RV32I-NEXT:    slti a0, a1, 0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    mv ra, t4
-; RV32I-NEXT:    and a0, a0, t4
-; RV32I-NEXT:    or a0, s5, a0
-; RV32I-NEXT:  .LBB9_7:
-; RV32I-NEXT:    sw a1, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli s5, t3, 24
-; RV32I-NEXT:    or t3, a4, a6
-; RV32I-NEXT:    or t6, s0, t6
-; RV32I-NEXT:    or a6, s1, t2
-; RV32I-NEXT:    or s3, s3, s2
-; RV32I-NEXT:    mv s9, s10
-; RV32I-NEXT:    beqz s8, .LBB9_9
-; RV32I-NEXT:  # %bb.8:
-; RV32I-NEXT:    mv s9, a0
-; RV32I-NEXT:  .LBB9_9:
-; RV32I-NEXT:    or a4, t0, a7
-; RV32I-NEXT:    or t0, s5, t1
-; RV32I-NEXT:    or a0, t6, t3
-; RV32I-NEXT:    srl a1, s3, a5
-; RV32I-NEXT:    slli a7, a6, 1
-; RV32I-NEXT:    sll a7, a7, s4
-; RV32I-NEXT:    or s2, a1, a7
-; RV32I-NEXT:    addi t2, a5, -96
-; RV32I-NEXT:    srl a1, a6, a5
-; RV32I-NEXT:    sw a1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a7, s2
-; RV32I-NEXT:    bltz t2, .LBB9_11
-; RV32I-NEXT:  # %bb.10:
-; RV32I-NEXT:    lw a7, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_11:
-; RV32I-NEXT:    or t0, t0, a4
-; RV32I-NEXT:    addi t6, a5, -32
-; RV32I-NEXT:    srl t1, a0, a5
-; RV32I-NEXT:    sw t1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgez t6, .LBB9_13
-; RV32I-NEXT:  # %bb.12:
-; RV32I-NEXT:    srl a1, t0, a5
-; RV32I-NEXT:    slli a4, a0, 1
-; RV32I-NEXT:    sll a4, a4, s4
-; RV32I-NEXT:    or t1, a1, a4
-; RV32I-NEXT:  .LBB9_13:
-; RV32I-NEXT:    sll t3, s3, s11
-; RV32I-NEXT:    li a4, 32
-; RV32I-NEXT:    sub s0, a4, a5
-; RV32I-NEXT:    slti a1, s0, 0
-; RV32I-NEXT:    neg s1, a1
-; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    bgeu a5, a1, .LBB9_15
-; RV32I-NEXT:  # %bb.14:
-; RV32I-NEXT:    and a1, s1, t3
-; RV32I-NEXT:    or a7, t1, a1
-; RV32I-NEXT:  .LBB9_15:
-; RV32I-NEXT:    mv t4, s7
-; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s4, t0
-; RV32I-NEXT:    beqz a5, .LBB9_17
-; RV32I-NEXT:  # %bb.16:
-; RV32I-NEXT:    mv s4, a7
-; RV32I-NEXT:  .LBB9_17:
-; RV32I-NEXT:    sll s1, s10, s11
-; RV32I-NEXT:    li a1, 96
-; RV32I-NEXT:    sub s5, a1, a5
-; RV32I-NEXT:    slti a1, s5, 0
-; RV32I-NEXT:    neg a7, a1
-; RV32I-NEXT:    li s7, 128
-; RV32I-NEXT:    sub t1, s7, a5
-; RV32I-NEXT:    sltiu a1, t1, 64
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu a5, s7, .LBB9_19
-; RV32I-NEXT:  # %bb.18:
-; RV32I-NEXT:    mv s7, a1
-; RV32I-NEXT:    and a1, a7, s1
-; RV32I-NEXT:    and a1, s7, a1
-; RV32I-NEXT:    or s9, s4, a1
-; RV32I-NEXT:  .LBB9_19:
-; RV32I-NEXT:    mv s4, ra
-; RV32I-NEXT:    mv s7, t4
-; RV32I-NEXT:    li ra, 64
-; RV32I-NEXT:    beqz a5, .LBB9_21
-; RV32I-NEXT:  # %bb.20:
-; RV32I-NEXT:    mv t0, s9
-; RV32I-NEXT:  .LBB9_21:
-; RV32I-NEXT:    neg a1, t1
-; RV32I-NEXT:    sub a4, a4, t1
-; RV32I-NEXT:    srl t4, a3, a1
-; RV32I-NEXT:    sw a4, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a4, .LBB9_24
-; RV32I-NEXT:  # %bb.22:
-; RV32I-NEXT:    mv a1, t4
-; RV32I-NEXT:    lw t5, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgeu t1, ra, .LBB9_25
-; RV32I-NEXT:  .LBB9_23:
-; RV32I-NEXT:    and a4, a7, s4
-; RV32I-NEXT:    or a7, a4, a1
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    bnez t1, .LBB9_26
-; RV32I-NEXT:    j .LBB9_27
-; RV32I-NEXT:  .LBB9_24:
-; RV32I-NEXT:    srl a1, s10, a1
-; RV32I-NEXT:    sub a4, ra, t1
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    sll a4, t5, a4
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    lw t5, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltu t1, ra, .LBB9_23
-; RV32I-NEXT:  .LBB9_25:
-; RV32I-NEXT:    lw a1, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a7, a1, s1
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    beqz t1, .LBB9_27
-; RV32I-NEXT:  .LBB9_26:
-; RV32I-NEXT:    mv a4, a7
-; RV32I-NEXT:  .LBB9_27:
-; RV32I-NEXT:    bltz t6, .LBB9_29
-; RV32I-NEXT:  # %bb.28:
-; RV32I-NEXT:    lw s2, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_29:
-; RV32I-NEXT:    sltiu a1, a5, 64
-; RV32I-NEXT:    mv a7, t5
-; RV32I-NEXT:    bltz s7, .LBB9_31
-; RV32I-NEXT:  # %bb.30:
-; RV32I-NEXT:    lw a7, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB9_31:
-; RV32I-NEXT:    neg s9, a1
-; RV32I-NEXT:    sltiu a1, s8, 64
-; RV32I-NEXT:    neg t5, a1
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB9_33
-; RV32I-NEXT:  # %bb.32:
-; RV32I-NEXT:    and a4, t5, a7
-; RV32I-NEXT:    mv s2, s3
-; RV32I-NEXT:    bnez a5, .LBB9_34
-; RV32I-NEXT:    j .LBB9_35
-; RV32I-NEXT:  .LBB9_33:
-; RV32I-NEXT:    and a1, s9, s2
-; RV32I-NEXT:    or a4, a1, a4
-; RV32I-NEXT:    mv s2, s3
-; RV32I-NEXT:    beqz a5, .LBB9_35
-; RV32I-NEXT:  .LBB9_34:
-; RV32I-NEXT:    mv s2, a4
-; RV32I-NEXT:  .LBB9_35:
-; RV32I-NEXT:    sub a1, ra, a5
-; RV32I-NEXT:    not a7, a1
-; RV32I-NEXT:    bgez s0, .LBB9_37
-; RV32I-NEXT:  # %bb.36:
-; RV32I-NEXT:    sll a1, a6, s11
-; RV32I-NEXT:    srli a4, s3, 1
-; RV32I-NEXT:    srl a4, a4, a7
-; RV32I-NEXT:    or t3, a1, a4
-; RV32I-NEXT:  .LBB9_37:
-; RV32I-NEXT:    slti a1, t6, 0
-; RV32I-NEXT:    neg s3, a1
-; RV32I-NEXT:    slti a1, t2, 0
-; RV32I-NEXT:    neg a4, a1
-; RV32I-NEXT:    sw a7, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltu a5, ra, .LBB9_39
-; RV32I-NEXT:  # %bb.38:
-; RV32I-NEXT:    lw a1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, a4, a1
-; RV32I-NEXT:    j .LBB9_40
-; RV32I-NEXT:  .LBB9_39:
-; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s3, a1
-; RV32I-NEXT:    or a4, a1, t3
-; RV32I-NEXT:  .LBB9_40:
-; RV32I-NEXT:    sw t5, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw t4, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a7, a0
-; RV32I-NEXT:    beqz a5, .LBB9_42
-; RV32I-NEXT:  # %bb.41:
-; RV32I-NEXT:    mv a7, a4
-; RV32I-NEXT:  .LBB9_42:
-; RV32I-NEXT:    mv t4, t2
-; RV32I-NEXT:    mv ra, s4
-; RV32I-NEXT:    sll s4, a3, s11
-; RV32I-NEXT:    srli a4, s10, 1
-; RV32I-NEXT:    not t5, t1
-; RV32I-NEXT:    bltz s5, .LBB9_44
-; RV32I-NEXT:  # %bb.43:
-; RV32I-NEXT:    mv t2, s1
-; RV32I-NEXT:    j .LBB9_45
-; RV32I-NEXT:  .LBB9_44:
-; RV32I-NEXT:    srl a1, a4, t5
-; RV32I-NEXT:    or t2, s4, a1
-; RV32I-NEXT:  .LBB9_45:
-; RV32I-NEXT:    lw a1, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll t3, a1, s11
-; RV32I-NEXT:    srli s6, s6, 1
-; RV32I-NEXT:    lw a1, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a1, .LBB9_47
-; RV32I-NEXT:  # %bb.46:
-; RV32I-NEXT:    mv s11, ra
-; RV32I-NEXT:    j .LBB9_48
-; RV32I-NEXT:  .LBB9_47:
-; RV32I-NEXT:    li a1, 192
-; RV32I-NEXT:    sub a1, a1, a5
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    srl a1, s6, a1
-; RV32I-NEXT:    or s11, t3, a1
-; RV32I-NEXT:  .LBB9_48:
-; RV32I-NEXT:    slti a1, s7, 0
-; RV32I-NEXT:    neg s7, a1
-; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    bltu s8, a1, .LBB9_50
-; RV32I-NEXT:  # %bb.49:
-; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a1, a1, 0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    lw s11, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a1, s11
-; RV32I-NEXT:    mv s11, a3
-; RV32I-NEXT:    bnez s8, .LBB9_51
-; RV32I-NEXT:    j .LBB9_52
-; RV32I-NEXT:  .LBB9_50:
-; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s7, a1
-; RV32I-NEXT:    or a1, a1, s11
-; RV32I-NEXT:    mv s11, a3
-; RV32I-NEXT:    beqz s8, .LBB9_52
-; RV32I-NEXT:  .LBB9_51:
-; RV32I-NEXT:    mv s11, a1
-; RV32I-NEXT:  .LBB9_52:
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB9_57
-; RV32I-NEXT:  # %bb.53:
-; RV32I-NEXT:    bnez a5, .LBB9_58
-; RV32I-NEXT:  .LBB9_54:
-; RV32I-NEXT:    bltz s0, .LBB9_59
-; RV32I-NEXT:  .LBB9_55:
-; RV32I-NEXT:    bltz s5, .LBB9_60
-; RV32I-NEXT:  .LBB9_56:
-; RV32I-NEXT:    mv a4, ra
-; RV32I-NEXT:    j .LBB9_61
-; RV32I-NEXT:  .LBB9_57:
-; RV32I-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a1, t2
-; RV32I-NEXT:    or s11, a7, a1
-; RV32I-NEXT:    beqz a5, .LBB9_54
-; RV32I-NEXT:  .LBB9_58:
-; RV32I-NEXT:    mv a0, s11
-; RV32I-NEXT:    bgez s0, .LBB9_55
-; RV32I-NEXT:  .LBB9_59:
-; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srl a1, a4, a1
-; RV32I-NEXT:    or s1, s4, a1
-; RV32I-NEXT:    bgez s5, .LBB9_56
-; RV32I-NEXT:  .LBB9_60:
-; RV32I-NEXT:    srl a1, s6, t5
-; RV32I-NEXT:    or a4, t3, a1
-; RV32I-NEXT:  .LBB9_61:
-; RV32I-NEXT:    lw t5, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    bltu t1, a1, .LBB9_65
-; RV32I-NEXT:  # %bb.62:
-; RV32I-NEXT:    bnez t1, .LBB9_66
-; RV32I-NEXT:  .LBB9_63:
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB9_67
-; RV32I-NEXT:  .LBB9_64:
-; RV32I-NEXT:    lw t2, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s7, t2
-; RV32I-NEXT:    lw a4, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a4, a1
-; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bnez a5, .LBB9_68
-; RV32I-NEXT:    j .LBB9_69
-; RV32I-NEXT:  .LBB9_65:
-; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a1, a1, 0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a1, t2
-; RV32I-NEXT:    or s1, a4, a1
-; RV32I-NEXT:    beqz t1, .LBB9_63
-; RV32I-NEXT:  .LBB9_66:
-; RV32I-NEXT:    sw s1, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bgeu a5, a1, .LBB9_64
-; RV32I-NEXT:  .LBB9_67:
-; RV32I-NEXT:    lw a1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s3, a1
-; RV32I-NEXT:    and a1, s9, a1
-; RV32I-NEXT:    lw a4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    lw t2, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    beqz a5, .LBB9_69
-; RV32I-NEXT:  .LBB9_68:
-; RV32I-NEXT:    mv a6, a1
-; RV32I-NEXT:  .LBB9_69:
-; RV32I-NEXT:    mv a4, t5
-; RV32I-NEXT:    bgez t4, .LBB9_76
-; RV32I-NEXT:  # %bb.70:
-; RV32I-NEXT:    bgez t6, .LBB9_77
-; RV32I-NEXT:  .LBB9_71:
-; RV32I-NEXT:    li t1, 64
-; RV32I-NEXT:    bltu a5, t1, .LBB9_78
-; RV32I-NEXT:  .LBB9_72:
-; RV32I-NEXT:    bnez a5, .LBB9_79
-; RV32I-NEXT:  .LBB9_73:
-; RV32I-NEXT:    bltz s0, .LBB9_80
-; RV32I-NEXT:  .LBB9_74:
-; RV32I-NEXT:    sltiu a4, a5, 128
-; RV32I-NEXT:    bltu a5, t1, .LBB9_81
-; RV32I-NEXT:  .LBB9_75:
-; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t1, a1, t2
-; RV32I-NEXT:    neg a7, a4
-; RV32I-NEXT:    bnez a5, .LBB9_82
-; RV32I-NEXT:    j .LBB9_83
-; RV32I-NEXT:  .LBB9_76:
-; RV32I-NEXT:    mv a4, t2
-; RV32I-NEXT:    bltz t6, .LBB9_71
-; RV32I-NEXT:  .LBB9_77:
-; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    li t1, 64
-; RV32I-NEXT:    bgeu a5, t1, .LBB9_72
-; RV32I-NEXT:  .LBB9_78:
-; RV32I-NEXT:    and a1, s1, ra
-; RV32I-NEXT:    lw a4, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, a1
-; RV32I-NEXT:    beqz a5, .LBB9_73
-; RV32I-NEXT:  .LBB9_79:
-; RV32I-NEXT:    mv s10, a4
-; RV32I-NEXT:    bgez s0, .LBB9_74
-; RV32I-NEXT:  .LBB9_80:
-; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srl a1, s6, a1
-; RV32I-NEXT:    or ra, t3, a1
-; RV32I-NEXT:    sltiu a4, a5, 128
-; RV32I-NEXT:    bgeu a5, t1, .LBB9_75
-; RV32I-NEXT:  .LBB9_81:
-; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, s3, a1
-; RV32I-NEXT:    or t1, a1, ra
-; RV32I-NEXT:    neg a7, a4
-; RV32I-NEXT:    beqz a5, .LBB9_83
-; RV32I-NEXT:  .LBB9_82:
-; RV32I-NEXT:    mv a3, t1
-; RV32I-NEXT:  .LBB9_83:
-; RV32I-NEXT:    and a4, a7, s10
-; RV32I-NEXT:    and a3, a7, a3
-; RV32I-NEXT:    bltz t6, .LBB9_85
-; RV32I-NEXT:  # %bb.84:
-; RV32I-NEXT:    mv t5, t2
-; RV32I-NEXT:  .LBB9_85:
-; RV32I-NEXT:    and a1, a7, t5
-; RV32I-NEXT:    and a1, a1, s9
-; RV32I-NEXT:    and a5, s3, t2
-; RV32I-NEXT:    and a5, a7, a5
-; RV32I-NEXT:    and a5, a5, s9
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    srli a7, a1, 24
-; RV32I-NEXT:    sb a7, 27(a2)
-; RV32I-NEXT:    srli a7, a1, 16
-; RV32I-NEXT:    sb a7, 26(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 25(a2)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 31(a2)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 30(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 29(a2)
-; RV32I-NEXT:    sb a4, 16(a2)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 19(a2)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 18(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 17(a2)
-; RV32I-NEXT:    sb a3, 20(a2)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 23(a2)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 22(a2)
+; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    slli a0, a6, 1
+; RV32I-NEXT:    not t0, a3
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    lbu a1, 1(a4)
+; RV32I-NEXT:    lbu a5, 0(a4)
+; RV32I-NEXT:    lbu a7, 2(a4)
+; RV32I-NEXT:    lbu t1, 3(a4)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or t1, t1, a1
+; RV32I-NEXT:    slli a1, t4, 1
+; RV32I-NEXT:    xori t2, a3, 31
+; RV32I-NEXT:    sll a1, a1, t2
+; RV32I-NEXT:    lbu a5, 13(a4)
+; RV32I-NEXT:    lbu a7, 12(a4)
+; RV32I-NEXT:    lbu t3, 14(a4)
+; RV32I-NEXT:    lbu t5, 15(a4)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or t3, t5, a5
+; RV32I-NEXT:    lbu a5, 17(a4)
+; RV32I-NEXT:    lbu a7, 16(a4)
+; RV32I-NEXT:    lbu t5, 18(a4)
+; RV32I-NEXT:    lbu t6, 19(a4)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t5, a5
+; RV32I-NEXT:    or a5, t6, a5
+; RV32I-NEXT:    slli a7, a5, 1
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    lbu t5, 21(a4)
+; RV32I-NEXT:    lbu t6, 20(a4)
+; RV32I-NEXT:    lbu s0, 22(a4)
+; RV32I-NEXT:    lbu s1, 23(a4)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    or t5, t5, t6
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or t5, s0, t5
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    lbu t6, 25(a4)
+; RV32I-NEXT:    lbu s0, 24(a4)
+; RV32I-NEXT:    lbu s1, 26(a4)
+; RV32I-NEXT:    lbu s2, 27(a4)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t6, t6, s0
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or t6, s1, t6
+; RV32I-NEXT:    or t6, s2, t6
+; RV32I-NEXT:    lbu s0, 29(a4)
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    lbu s2, 28(a4)
+; RV32I-NEXT:    sll t0, s1, t0
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    lbu s1, 30(a4)
+; RV32I-NEXT:    or s0, s0, s2
+; RV32I-NEXT:    slli s2, t3, 1
+; RV32I-NEXT:    sll s2, s2, t2
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu a4, 31(a4)
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s1, t5, 1
+; RV32I-NEXT:    sll s1, s1, t2
+; RV32I-NEXT:    slli a4, a4, 24
+; RV32I-NEXT:    or a4, a4, s0
+; RV32I-NEXT:    slli s0, a4, 1
+; RV32I-NEXT:    sll t2, s0, t2
+; RV32I-NEXT:    srl t4, t4, a3
+; RV32I-NEXT:    srl t1, t1, a3
+; RV32I-NEXT:    srl t3, t3, a3
+; RV32I-NEXT:    srl a6, a6, a3
+; RV32I-NEXT:    srl t5, t5, a3
+; RV32I-NEXT:    srl a5, a5, a3
+; RV32I-NEXT:    srl t6, t6, a3
+; RV32I-NEXT:    srl a3, a4, a3
+; RV32I-NEXT:    srli a4, t6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    or a4, t6, t2
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    srli t2, t6, 8
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a3, 24
+; RV32I-NEXT:    sb t2, 31(a2)
+; RV32I-NEXT:    srli t2, a3, 16
+; RV32I-NEXT:    sb t2, 30(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 21(a2)
-; RV32I-NEXT:    sb t0, 0(a2)
-; RV32I-NEXT:    sb a6, 12(a2)
-; RV32I-NEXT:    srli a1, t0, 24
+; RV32I-NEXT:    sb a3, 29(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 18(a2)
+; RV32I-NEXT:    or s1, a5, s1
+; RV32I-NEXT:    sb a5, 16(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 17(a2)
+; RV32I-NEXT:    srli a3, t5, 16
+; RV32I-NEXT:    sb a3, 22(a2)
+; RV32I-NEXT:    or a3, t5, t0
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    srli a5, t5, 8
+; RV32I-NEXT:    sb a5, 21(a2)
+; RV32I-NEXT:    srli a5, a6, 16
+; RV32I-NEXT:    sb a5, 10(a2)
+; RV32I-NEXT:    or a5, a6, s2
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    srli a6, t3, 16
+; RV32I-NEXT:    sb a6, 14(a2)
+; RV32I-NEXT:    or a6, t3, a7
+; RV32I-NEXT:    sb t3, 12(a2)
+; RV32I-NEXT:    srli a7, t3, 8
+; RV32I-NEXT:    sb a7, 13(a2)
+; RV32I-NEXT:    srli a7, t1, 16
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    srli a7, t1, 8
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    srli a7, t4, 16
+; RV32I-NEXT:    sb a7, 6(a2)
+; RV32I-NEXT:    or a0, t4, a0
+; RV32I-NEXT:    sb t4, 4(a2)
+; RV32I-NEXT:    srli a7, t4, 8
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli s1, s1, 24
+; RV32I-NEXT:    sb s1, 19(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 23(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a3, a6, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a1, t0, 16
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    srli a1, t0, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb s2, 8(a2)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    srli a1, a6, 16
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    srli a1, a6, 8
-; RV32I-NEXT:    sb a1, 13(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, s2, 24
-; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a0, s2, 16
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    srli a0, s2, 8
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    lw ra, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 112
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 144
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2295,815 +2108,654 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 17(a0)
-; RV64I-NEXT:    lbu a4, 16(a0)
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu a6, 19(a0)
+; RV64I-NEXT:    addi sp, sp, -224
+; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 3(a0)
+; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 4(a0)
+; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    lbu s9, 1(a1)
+; RV64I-NEXT:    lbu s10, 0(a1)
+; RV64I-NEXT:    lbu s11, 2(a1)
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s9, s9, s10
+; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    lbu s10, 5(a1)
+; RV64I-NEXT:    lbu ra, 4(a1)
+; RV64I-NEXT:    or s9, s11, s9
+; RV64I-NEXT:    lbu s11, 6(a1)
+; RV64I-NEXT:    slli s10, s10, 8
+; RV64I-NEXT:    or s10, s10, ra
+; RV64I-NEXT:    lbu ra, 7(a1)
+; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    or s10, s11, s10
+; RV64I-NEXT:    lbu s11, 21(a0)
+; RV64I-NEXT:    slli ra, ra, 24
+; RV64I-NEXT:    or s10, ra, s10
+; RV64I-NEXT:    lbu ra, 22(a0)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli s10, s10, 32
+; RV64I-NEXT:    or s9, s10, s9
+; RV64I-NEXT:    lbu s10, 23(a0)
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or t0, s9, a1
+; RV64I-NEXT:    lbu s9, 24(a0)
+; RV64I-NEXT:    lbu a7, 25(a0)
+; RV64I-NEXT:    lbu a6, 26(a0)
+; RV64I-NEXT:    lbu a5, 27(a0)
+; RV64I-NEXT:    lbu a1, 31(a0)
+; RV64I-NEXT:    lbu a3, 30(a0)
+; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu a0, 28(a0)
+; RV64I-NEXT:    sb a1, 119(sp)
+; RV64I-NEXT:    sb a3, 118(sp)
+; RV64I-NEXT:    sb a4, 117(sp)
+; RV64I-NEXT:    sb a0, 116(sp)
+; RV64I-NEXT:    sb a5, 115(sp)
+; RV64I-NEXT:    sb a6, 114(sp)
+; RV64I-NEXT:    sb a7, 113(sp)
+; RV64I-NEXT:    sb s9, 112(sp)
+; RV64I-NEXT:    sb s10, 111(sp)
+; RV64I-NEXT:    sb ra, 110(sp)
+; RV64I-NEXT:    sb s11, 109(sp)
+; RV64I-NEXT:    sb s8, 108(sp)
+; RV64I-NEXT:    sb s7, 107(sp)
+; RV64I-NEXT:    sb s6, 106(sp)
+; RV64I-NEXT:    sb s5, 105(sp)
+; RV64I-NEXT:    sb s4, 104(sp)
+; RV64I-NEXT:    sb s3, 103(sp)
+; RV64I-NEXT:    sb s2, 102(sp)
+; RV64I-NEXT:    sb s1, 101(sp)
+; RV64I-NEXT:    sb s0, 100(sp)
+; RV64I-NEXT:    sb t6, 99(sp)
+; RV64I-NEXT:    sb t5, 98(sp)
+; RV64I-NEXT:    sb t4, 97(sp)
+; RV64I-NEXT:    sb zero, 87(sp)
+; RV64I-NEXT:    sb zero, 86(sp)
+; RV64I-NEXT:    sb zero, 85(sp)
+; RV64I-NEXT:    sb zero, 84(sp)
+; RV64I-NEXT:    sb zero, 83(sp)
+; RV64I-NEXT:    sb zero, 82(sp)
+; RV64I-NEXT:    sb zero, 81(sp)
+; RV64I-NEXT:    sb zero, 80(sp)
+; RV64I-NEXT:    sb zero, 79(sp)
+; RV64I-NEXT:    sb zero, 78(sp)
+; RV64I-NEXT:    sb zero, 77(sp)
+; RV64I-NEXT:    sb zero, 76(sp)
+; RV64I-NEXT:    sb zero, 75(sp)
+; RV64I-NEXT:    sb zero, 74(sp)
+; RV64I-NEXT:    sb zero, 73(sp)
+; RV64I-NEXT:    sb zero, 72(sp)
+; RV64I-NEXT:    sb zero, 71(sp)
+; RV64I-NEXT:    sb zero, 70(sp)
+; RV64I-NEXT:    sb zero, 69(sp)
+; RV64I-NEXT:    sb zero, 68(sp)
+; RV64I-NEXT:    sb zero, 67(sp)
+; RV64I-NEXT:    sb zero, 66(sp)
+; RV64I-NEXT:    sb zero, 65(sp)
+; RV64I-NEXT:    sb zero, 64(sp)
+; RV64I-NEXT:    sb zero, 63(sp)
+; RV64I-NEXT:    sb zero, 62(sp)
+; RV64I-NEXT:    sb zero, 61(sp)
+; RV64I-NEXT:    sb zero, 60(sp)
+; RV64I-NEXT:    sb zero, 59(sp)
+; RV64I-NEXT:    sb zero, 58(sp)
+; RV64I-NEXT:    sb zero, 57(sp)
+; RV64I-NEXT:    sb zero, 56(sp)
+; RV64I-NEXT:    sb t3, 96(sp)
+; RV64I-NEXT:    sb t2, 95(sp)
+; RV64I-NEXT:    sb t1, 94(sp)
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 93(sp)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 92(sp)
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 91(sp)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 90(sp)
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 89(sp)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 88(sp)
+; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    srli a0, a0, 59
+; RV64I-NEXT:    addi a1, sp, 88
+; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    lbu a1, 9(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 10(a0)
+; RV64I-NEXT:    lbu a5, 11(a0)
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a1, a4, a1
+; RV64I-NEXT:    lbu a3, 13(a0)
+; RV64I-NEXT:    lbu a4, 12(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a3, a7, a3
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a3, a1, a5
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a4, 21(a0)
-; RV64I-NEXT:    lbu a5, 20(a0)
-; RV64I-NEXT:    lbu a7, 22(a0)
-; RV64I-NEXT:    lbu t0, 23(a0)
+; RV64I-NEXT:    or a1, a5, a1
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu t1, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    or a4, t1, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    lbu a4, 25(a0)
 ; RV64I-NEXT:    lbu a5, 24(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a7, 27(a0)
+; RV64I-NEXT:    lbu a7, 26(a0)
+; RV64I-NEXT:    or a6, a1, a6
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a5, a7, a4
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a6, 28(a0)
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu t0, 31(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
 ; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    lbu a1, 29(a0)
+; RV64I-NEXT:    lbu a5, 27(a0)
 ; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or t0, t0, a4
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t2, 6(a0)
-; RV64I-NEXT:    lbu t3, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a6, t2, a6
-; RV64I-NEXT:    or a6, t3, a6
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a4, a4, t1
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
-; RV64I-NEXT:    lbu t1, 10(a0)
-; RV64I-NEXT:    lbu t2, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu a7, 28(a0)
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu t2, 31(a0)
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a5, a5, 24
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t1, 12(a0)
-; RV64I-NEXT:    lbu t3, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t1
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a7, t3, a7
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or t1, a0, t2
-; RV64I-NEXT:    lbu a0, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu t2, 3(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    lbu t3, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a6, t3, a6
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, t1, a1
+; RV64I-NEXT:    or a1, t2, a1
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu a7, 16(a0)
+; RV64I-NEXT:    lbu t1, 18(a0)
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a1, a4, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    or a1, t1, a1
+; RV64I-NEXT:    lbu a4, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t1, 22(a0)
+; RV64I-NEXT:    andi t0, t0, 7
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    lbu a7, 23(a0)
+; RV64I-NEXT:    or a4, t1, a4
+; RV64I-NEXT:    srli t1, a6, 1
+; RV64I-NEXT:    lbu t2, 19(a0)
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    xori a7, t0, 63
+; RV64I-NEXT:    srl a0, t1, a7
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a1, a4, a1
+; RV64I-NEXT:    or a4, a1, t2
+; RV64I-NEXT:    srli a1, a4, 1
+; RV64I-NEXT:    srl a7, a1, a7
+; RV64I-NEXT:    srli a1, a3, 1
+; RV64I-NEXT:    not t1, t0
+; RV64I-NEXT:    srl t1, a1, t1
+; RV64I-NEXT:    sll a1, a3, t0
+; RV64I-NEXT:    sll a3, a5, t0
+; RV64I-NEXT:    sll a4, a4, t0
+; RV64I-NEXT:    sll a5, a6, t0
+; RV64I-NEXT:    srli a6, a4, 56
+; RV64I-NEXT:    sb a6, 23(a2)
+; RV64I-NEXT:    srli a6, a4, 48
+; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a4, 40
+; RV64I-NEXT:    sb a6, 21(a2)
+; RV64I-NEXT:    srli a6, a4, 32
+; RV64I-NEXT:    sb a6, 20(a2)
+; RV64I-NEXT:    srli a6, a4, 24
+; RV64I-NEXT:    sb a6, 19(a2)
+; RV64I-NEXT:    srli a6, a4, 16
+; RV64I-NEXT:    sb a6, 18(a2)
+; RV64I-NEXT:    or a6, a4, t1
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 17(a2)
+; RV64I-NEXT:    srli a4, a3, 56
+; RV64I-NEXT:    sb a4, 31(a2)
+; RV64I-NEXT:    srli a4, a3, 48
+; RV64I-NEXT:    sb a4, 30(a2)
+; RV64I-NEXT:    srli a4, a3, 40
+; RV64I-NEXT:    sb a4, 29(a2)
+; RV64I-NEXT:    srli a4, a3, 32
+; RV64I-NEXT:    sb a4, 28(a2)
+; RV64I-NEXT:    srli a4, a3, 24
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    srli a4, a3, 16
+; RV64I-NEXT:    sb a4, 26(a2)
+; RV64I-NEXT:    or a4, a3, a7
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, a5, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a5, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a5, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a5, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a5, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a5, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    sb a5, 0(a2)
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a1, a0, t2
-; RV64I-NEXT:    sll a0, t1, a1
-; RV64I-NEXT:    not t4, a1
-; RV64I-NEXT:    srli a6, a4, 1
-; RV64I-NEXT:    srl a6, a6, t4
-; RV64I-NEXT:    or a6, a0, a6
-; RV64I-NEXT:    addi t2, a1, -192
-; RV64I-NEXT:    sll a7, a4, a1
-; RV64I-NEXT:    mv t3, a6
-; RV64I-NEXT:    bltz t2, .LBB10_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv t3, a7
-; RV64I-NEXT:  .LBB10_2:
-; RV64I-NEXT:    or a0, t0, a5
-; RV64I-NEXT:    addi a5, a1, -64
-; RV64I-NEXT:    sll t0, a3, a1
-; RV64I-NEXT:    bltz a5, .LBB10_4
-; RV64I-NEXT:  # %bb.3:
-; RV64I-NEXT:    mv s1, t0
-; RV64I-NEXT:    j .LBB10_5
-; RV64I-NEXT:  .LBB10_4:
-; RV64I-NEXT:    sll t5, a0, a1
-; RV64I-NEXT:    srli t6, a3, 1
-; RV64I-NEXT:    srl t4, t6, t4
-; RV64I-NEXT:    or s1, t5, t4
-; RV64I-NEXT:  .LBB10_5:
-; RV64I-NEXT:    negw t6, a1
-; RV64I-NEXT:    srl t4, t1, t6
-; RV64I-NEXT:    li s0, 64
-; RV64I-NEXT:    li t5, 128
-; RV64I-NEXT:    sub s0, s0, a1
-; RV64I-NEXT:    bltu a1, t5, .LBB10_11
-; RV64I-NEXT:  # %bb.6:
-; RV64I-NEXT:    bnez a1, .LBB10_12
-; RV64I-NEXT:  .LBB10_7:
-; RV64I-NEXT:    bgez s0, .LBB10_9
-; RV64I-NEXT:  .LBB10_8:
-; RV64I-NEXT:    srl a4, a4, t6
-; RV64I-NEXT:    slli t1, t1, 1
-; RV64I-NEXT:    subw t3, t5, a1
-; RV64I-NEXT:    not t3, t3
-; RV64I-NEXT:    sll t1, t1, t3
-; RV64I-NEXT:    or t4, a4, t1
-; RV64I-NEXT:  .LBB10_9:
-; RV64I-NEXT:    slti a4, a5, 0
-; RV64I-NEXT:    neg a4, a4
-; RV64I-NEXT:    bltu a1, t5, .LBB10_13
-; RV64I-NEXT:  # %bb.10:
-; RV64I-NEXT:    slti t0, t2, 0
-; RV64I-NEXT:    neg t0, t0
-; RV64I-NEXT:    and t0, t0, a7
-; RV64I-NEXT:    bnez a1, .LBB10_14
-; RV64I-NEXT:    j .LBB10_15
-; RV64I-NEXT:  .LBB10_11:
-; RV64I-NEXT:    slti t3, s0, 0
-; RV64I-NEXT:    neg t3, t3
-; RV64I-NEXT:    and t3, t3, t4
-; RV64I-NEXT:    or t3, s1, t3
-; RV64I-NEXT:    beqz a1, .LBB10_7
-; RV64I-NEXT:  .LBB10_12:
-; RV64I-NEXT:    mv a0, t3
-; RV64I-NEXT:    bltz s0, .LBB10_8
-; RV64I-NEXT:    j .LBB10_9
-; RV64I-NEXT:  .LBB10_13:
-; RV64I-NEXT:    and t0, a4, t0
-; RV64I-NEXT:    or t0, t0, t4
-; RV64I-NEXT:    beqz a1, .LBB10_15
-; RV64I-NEXT:  .LBB10_14:
-; RV64I-NEXT:    mv a3, t0
-; RV64I-NEXT:  .LBB10_15:
-; RV64I-NEXT:    bltz a5, .LBB10_17
-; RV64I-NEXT:  # %bb.16:
-; RV64I-NEXT:    mv a6, a7
-; RV64I-NEXT:  .LBB10_17:
-; RV64I-NEXT:    sltiu a1, a1, 128
-; RV64I-NEXT:    neg a1, a1
-; RV64I-NEXT:    and a5, a1, a6
-; RV64I-NEXT:    and a4, a4, a7
-; RV64I-NEXT:    and a1, a1, a4
-; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 8(a2)
-; RV64I-NEXT:    srli a4, a1, 56
-; RV64I-NEXT:    sb a4, 7(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 4(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 3(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 2(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 1(a2)
-; RV64I-NEXT:    srli a1, a5, 56
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a5, 48
-; RV64I-NEXT:    sb a1, 14(a2)
-; RV64I-NEXT:    srli a1, a5, 40
-; RV64I-NEXT:    sb a1, 13(a2)
-; RV64I-NEXT:    srli a1, a5, 32
-; RV64I-NEXT:    sb a1, 12(a2)
-; RV64I-NEXT:    srli a1, a5, 24
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    srli a1, a5, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a0, 24(a2)
-; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 31(a2)
-; RV64I-NEXT:    srli a1, a0, 48
-; RV64I-NEXT:    sb a1, 30(a2)
-; RV64I-NEXT:    srli a1, a0, 40
-; RV64I-NEXT:    sb a1, 29(a2)
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    sb a1, 28(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    sb a1, 27(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    sb a1, 26(a2)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 25(a2)
-; RV64I-NEXT:    srli a0, a3, 56
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    srli a0, a3, 48
-; RV64I-NEXT:    sb a0, 22(a2)
-; RV64I-NEXT:    srli a0, a3, 40
-; RV64I-NEXT:    sb a0, 21(a2)
-; RV64I-NEXT:    srli a0, a3, 32
-; RV64I-NEXT:    sb a0, 20(a2)
-; RV64I-NEXT:    srli a0, a3, 24
-; RV64I-NEXT:    sb a0, 19(a2)
-; RV64I-NEXT:    srli a0, a3, 16
-; RV64I-NEXT:    sb a0, 18(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 17(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a4, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 224
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a4, 25(a0)
-; RV32I-NEXT:    lbu t2, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    lbu a6, 28(a0)
-; RV32I-NEXT:    lbu t1, 29(a0)
-; RV32I-NEXT:    lbu t3, 30(a0)
-; RV32I-NEXT:    lbu t4, 31(a0)
-; RV32I-NEXT:    lbu t6, 16(a0)
-; RV32I-NEXT:    lbu t0, 17(a0)
-; RV32I-NEXT:    lbu s1, 18(a0)
-; RV32I-NEXT:    lbu s2, 19(a0)
-; RV32I-NEXT:    lbu s0, 20(a0)
-; RV32I-NEXT:    lbu s5, 21(a0)
-; RV32I-NEXT:    lbu s6, 22(a0)
-; RV32I-NEXT:    lbu s7, 23(a0)
-; RV32I-NEXT:    lbu a3, 9(a0)
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu s3, 10(a0)
-; RV32I-NEXT:    lbu s4, 11(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    or a3, s3, a3
-; RV32I-NEXT:    or a3, s4, a3
-; RV32I-NEXT:    lbu a5, 13(a0)
-; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s8, 15(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s3
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    or a5, s4, a5
-; RV32I-NEXT:    or a5, s8, a5
-; RV32I-NEXT:    sw a5, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 1(a0)
-; RV32I-NEXT:    lbu s3, 0(a0)
-; RV32I-NEXT:    lbu s4, 2(a0)
-; RV32I-NEXT:    lbu s8, 3(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s3
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    or a5, s4, a5
-; RV32I-NEXT:    or s4, s8, a5
-; RV32I-NEXT:    lbu a5, 5(a0)
-; RV32I-NEXT:    lbu s3, 4(a0)
-; RV32I-NEXT:    lbu s8, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s3
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a5, s8, a5
-; RV32I-NEXT:    or s8, a0, a5
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu s3, 2(a1)
+; RV32I-NEXT:    addi sp, sp, -144
+; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu s8, 1(a1)
+; RV32I-NEXT:    lbu s9, 20(a0)
+; RV32I-NEXT:    lbu s10, 0(a1)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    lbu ra, 2(a1)
+; RV32I-NEXT:    or s8, s8, s10
+; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    or s8, ra, s8
+; RV32I-NEXT:    lbu ra, 23(a0)
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, s3, a0
-; RV32I-NEXT:    or a5, a1, a5
-; RV32I-NEXT:    sll a1, s8, a5
-; RV32I-NEXT:    not a0, a5
-; RV32I-NEXT:    srli s3, s4, 1
-; RV32I-NEXT:    srl s3, s3, a0
-; RV32I-NEXT:    or s10, a1, s3
-; RV32I-NEXT:    addi s3, a5, -224
-; RV32I-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sll a1, s4, a5
-; RV32I-NEXT:    sw s10, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s3, .LBB10_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv s10, a1
-; RV32I-NEXT:  .LBB10_2:
-; RV32I-NEXT:    sw a1, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli s3, t0, 8
-; RV32I-NEXT:    slli s11, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    slli ra, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll a1, a1, a5
-; RV32I-NEXT:    srli t0, a3, 1
-; RV32I-NEXT:    srl s1, t0, a0
-; RV32I-NEXT:    or s5, a1, s1
-; RV32I-NEXT:    addi s9, a5, -160
-; RV32I-NEXT:    sll a1, a3, a5
-; RV32I-NEXT:    sw s5, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s9, .LBB10_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv s5, a1
-; RV32I-NEXT:  .LBB10_4:
-; RV32I-NEXT:    sw a1, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, t2, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    or s1, s3, t6
-; RV32I-NEXT:    or s2, s2, s11
-; RV32I-NEXT:    or s0, ra, s0
-; RV32I-NEXT:    or s3, s7, s6
-; RV32I-NEXT:    neg s7, a5
-; RV32I-NEXT:    srl ra, s8, s7
-; RV32I-NEXT:    li s6, 160
-; RV32I-NEXT:    addi t6, a5, -128
-; RV32I-NEXT:    li t2, 64
-; RV32I-NEXT:    sub s6, s6, a5
-; RV32I-NEXT:    sw s6, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu t6, t2, .LBB10_6
-; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    slti s6, s6, 0
-; RV32I-NEXT:    neg s6, s6
-; RV32I-NEXT:    and s6, s6, ra
-; RV32I-NEXT:    or s10, s5, s6
-; RV32I-NEXT:  .LBB10_6:
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    or s5, t5, a1
-; RV32I-NEXT:    or t5, s2, s1
-; RV32I-NEXT:    or s2, s3, s0
-; RV32I-NEXT:    lw s3, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    mv a1, s3
-; RV32I-NEXT:    beqz t6, .LBB10_8
-; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a1, s10
-; RV32I-NEXT:  .LBB10_8:
-; RV32I-NEXT:    or a7, t1, a6
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    or a6, s5, a4
-; RV32I-NEXT:    sll a4, s2, a5
-; RV32I-NEXT:    srli t1, t5, 1
-; RV32I-NEXT:    srl t1, t1, a0
-; RV32I-NEXT:    or t1, a4, t1
-; RV32I-NEXT:    addi t4, a5, -96
-; RV32I-NEXT:    sll a4, t5, a5
-; RV32I-NEXT:    sw a4, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a4, t1
-; RV32I-NEXT:    sw t4, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz t4, .LBB10_10
-; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    lw a4, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB10_10:
-; RV32I-NEXT:    or a7, t3, a7
-; RV32I-NEXT:    addi s4, a5, -32
-; RV32I-NEXT:    sll t3, a6, a5
-; RV32I-NEXT:    sw t3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgez s4, .LBB10_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sll t3, a7, a5
-; RV32I-NEXT:    srli s0, a6, 1
-; RV32I-NEXT:    srl a0, s0, a0
-; RV32I-NEXT:    or t3, t3, a0
-; RV32I-NEXT:  .LBB10_12:
-; RV32I-NEXT:    srl s5, s2, s7
-; RV32I-NEXT:    li a0, 32
-; RV32I-NEXT:    sub s0, a0, a5
-; RV32I-NEXT:    sw s0, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slti s1, s0, 0
-; RV32I-NEXT:    neg s6, s1
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_14
-; RV32I-NEXT:  # %bb.13:
-; RV32I-NEXT:    and a4, s6, s5
-; RV32I-NEXT:    or a4, t3, a4
-; RV32I-NEXT:  .LBB10_14:
-; RV32I-NEXT:    sw s5, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t3, a7
-; RV32I-NEXT:    beqz a5, .LBB10_16
-; RV32I-NEXT:  # %bb.15:
-; RV32I-NEXT:    mv t3, a4
-; RV32I-NEXT:  .LBB10_16:
-; RV32I-NEXT:    srl s10, s3, s7
-; RV32I-NEXT:    li a4, 96
-; RV32I-NEXT:    sub s5, a4, a5
-; RV32I-NEXT:    slti a4, s5, 0
-; RV32I-NEXT:    neg a4, a4
-; RV32I-NEXT:    li t2, 128
-; RV32I-NEXT:    sub s11, t2, a5
-; RV32I-NEXT:    sltiu s1, s11, 64
-; RV32I-NEXT:    neg s1, s1
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_18
-; RV32I-NEXT:  # %bb.17:
-; RV32I-NEXT:    and a1, a4, s10
-; RV32I-NEXT:    and a1, s1, a1
-; RV32I-NEXT:    or a1, t3, a1
-; RV32I-NEXT:  .LBB10_18:
-; RV32I-NEXT:    li t2, 64
-; RV32I-NEXT:    sw s1, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    beqz a5, .LBB10_20
-; RV32I-NEXT:  # %bb.19:
-; RV32I-NEXT:    mv a7, a1
-; RV32I-NEXT:  .LBB10_20:
-; RV32I-NEXT:    neg s1, s11
-; RV32I-NEXT:    sub t3, a0, s11
-; RV32I-NEXT:    sll a1, a3, s1
-; RV32I-NEXT:    bltz t3, .LBB10_23
-; RV32I-NEXT:  # %bb.21:
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    bgeu s11, t2, .LBB10_24
-; RV32I-NEXT:  .LBB10_22:
-; RV32I-NEXT:    and a4, a4, ra
-; RV32I-NEXT:    or a4, a4, a0
-; RV32I-NEXT:    mv a0, s8
-; RV32I-NEXT:    bnez s11, .LBB10_25
-; RV32I-NEXT:    j .LBB10_26
-; RV32I-NEXT:  .LBB10_23:
-; RV32I-NEXT:    sll a0, s3, s1
-; RV32I-NEXT:    sub s1, t2, s11
-; RV32I-NEXT:    not s1, s1
-; RV32I-NEXT:    srl t0, t0, s1
-; RV32I-NEXT:    or a0, a0, t0
-; RV32I-NEXT:    bltu s11, t2, .LBB10_22
-; RV32I-NEXT:  .LBB10_24:
-; RV32I-NEXT:    and a4, s6, s10
-; RV32I-NEXT:    mv a0, s8
-; RV32I-NEXT:    beqz s11, .LBB10_26
-; RV32I-NEXT:  .LBB10_25:
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:  .LBB10_26:
-; RV32I-NEXT:    bltz s4, .LBB10_28
-; RV32I-NEXT:  # %bb.27:
-; RV32I-NEXT:    lw t1, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB10_28:
-; RV32I-NEXT:    sw s6, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sltiu t0, a5, 64
-; RV32I-NEXT:    lw a4, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    mv s0, s9
-; RV32I-NEXT:    bltz s9, .LBB10_30
-; RV32I-NEXT:  # %bb.29:
-; RV32I-NEXT:    lw a4, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB10_30:
-; RV32I-NEXT:    neg s9, t0
-; RV32I-NEXT:    sltiu t0, t6, 64
-; RV32I-NEXT:    neg s1, t0
-; RV32I-NEXT:    li t0, 128
-; RV32I-NEXT:    bltu a5, t0, .LBB10_32
-; RV32I-NEXT:  # %bb.31:
-; RV32I-NEXT:    and a4, s1, a4
-; RV32I-NEXT:    mv t0, s2
-; RV32I-NEXT:    bnez a5, .LBB10_33
-; RV32I-NEXT:    j .LBB10_34
-; RV32I-NEXT:  .LBB10_32:
-; RV32I-NEXT:    and a4, s9, t1
-; RV32I-NEXT:    or a4, a4, a0
-; RV32I-NEXT:    mv t0, s2
-; RV32I-NEXT:    beqz a5, .LBB10_34
-; RV32I-NEXT:  .LBB10_33:
-; RV32I-NEXT:    mv t0, a4
-; RV32I-NEXT:  .LBB10_34:
-; RV32I-NEXT:    srl t1, a3, s7
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sub a0, t2, a5
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    lw s3, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz s3, .LBB10_36
-; RV32I-NEXT:  # %bb.35:
-; RV32I-NEXT:    mv s3, s10
-; RV32I-NEXT:    j .LBB10_37
-; RV32I-NEXT:  .LBB10_36:
-; RV32I-NEXT:    sll a0, a4, a0
-; RV32I-NEXT:    or s3, t1, a0
-; RV32I-NEXT:  .LBB10_37:
-; RV32I-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srl s10, a0, s7
-; RV32I-NEXT:    slli s8, s8, 1
-; RV32I-NEXT:    not a0, s11
-; RV32I-NEXT:    bltz s5, .LBB10_39
-; RV32I-NEXT:  # %bb.38:
-; RV32I-NEXT:    mv t4, s8
-; RV32I-NEXT:    mv s6, s10
-; RV32I-NEXT:    mv s8, ra
-; RV32I-NEXT:    bltu s11, t2, .LBB10_40
-; RV32I-NEXT:    j .LBB10_41
-; RV32I-NEXT:  .LBB10_39:
-; RV32I-NEXT:    mv t4, s8
-; RV32I-NEXT:    sll s8, s8, a0
-; RV32I-NEXT:    mv s6, s10
-; RV32I-NEXT:    or s8, s10, s8
-; RV32I-NEXT:    bgeu s11, t2, .LBB10_41
-; RV32I-NEXT:  .LBB10_40:
-; RV32I-NEXT:    slti t3, t3, 0
-; RV32I-NEXT:    neg t3, t3
-; RV32I-NEXT:    and a1, t3, a1
-; RV32I-NEXT:    or s3, s8, a1
-; RV32I-NEXT:  .LBB10_41:
-; RV32I-NEXT:    beqz s11, .LBB10_43
-; RV32I-NEXT:  # %bb.42:
-; RV32I-NEXT:    sw s3, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB10_43:
-; RV32I-NEXT:    mv s11, s4
-; RV32I-NEXT:    slti a1, s4, 0
-; RV32I-NEXT:    neg s8, a1
-; RV32I-NEXT:    slti a1, s0, 0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    li t3, 128
-; RV32I-NEXT:    bltu a5, t3, .LBB10_45
-; RV32I-NEXT:  # %bb.44:
-; RV32I-NEXT:    lw s4, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t3, a1, s4
-; RV32I-NEXT:    and t3, s1, t3
-; RV32I-NEXT:    mv s3, t5
-; RV32I-NEXT:    bnez a5, .LBB10_46
-; RV32I-NEXT:    j .LBB10_47
-; RV32I-NEXT:  .LBB10_45:
-; RV32I-NEXT:    lw t3, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t3, s8, t3
-; RV32I-NEXT:    and t3, s9, t3
-; RV32I-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or t3, t3, s0
-; RV32I-NEXT:    lw s4, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    mv s3, t5
-; RV32I-NEXT:    beqz a5, .LBB10_47
-; RV32I-NEXT:  .LBB10_46:
-; RV32I-NEXT:    mv s3, t3
-; RV32I-NEXT:  .LBB10_47:
-; RV32I-NEXT:    lw t3, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez t3, .LBB10_49
-; RV32I-NEXT:  # %bb.48:
-; RV32I-NEXT:    srl t3, t5, s7
-; RV32I-NEXT:    slli s2, s2, 1
-; RV32I-NEXT:    lw t5, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll t5, s2, t5
-; RV32I-NEXT:    or t3, t3, t5
-; RV32I-NEXT:    sw t3, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB10_49:
-; RV32I-NEXT:    lw s2, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti t3, s2, 0
-; RV32I-NEXT:    neg t5, t3
-; RV32I-NEXT:    bltu a5, t2, .LBB10_51
-; RV32I-NEXT:  # %bb.50:
-; RV32I-NEXT:    lw t3, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and s1, t5, t3
-; RV32I-NEXT:    mv t3, a6
-; RV32I-NEXT:    bnez a5, .LBB10_52
-; RV32I-NEXT:    j .LBB10_53
-; RV32I-NEXT:  .LBB10_51:
-; RV32I-NEXT:    lw t3, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and t3, s8, t3
-; RV32I-NEXT:    lw s1, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or s1, t3, s1
-; RV32I-NEXT:    mv t3, a6
-; RV32I-NEXT:    beqz a5, .LBB10_53
-; RV32I-NEXT:  .LBB10_52:
-; RV32I-NEXT:    mv t3, s1
-; RV32I-NEXT:  .LBB10_53:
-; RV32I-NEXT:    bgez s5, .LBB10_55
-; RV32I-NEXT:  # %bb.54:
-; RV32I-NEXT:    sll a0, a4, a0
-; RV32I-NEXT:    or a0, t1, a0
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB10_55:
-; RV32I-NEXT:    lw a4, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw t1, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw a0, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a0, .LBB10_58
-; RV32I-NEXT:  # %bb.56:
-; RV32I-NEXT:    mv a0, ra
-; RV32I-NEXT:    bgeu t6, t2, .LBB10_59
-; RV32I-NEXT:  .LBB10_57:
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    or a1, a1, a0
-; RV32I-NEXT:    mv a0, a3
-; RV32I-NEXT:    bnez t6, .LBB10_60
-; RV32I-NEXT:    j .LBB10_61
-; RV32I-NEXT:  .LBB10_58:
-; RV32I-NEXT:    li a0, 192
-; RV32I-NEXT:    sub a0, a0, a5
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    sll a0, t4, a0
-; RV32I-NEXT:    or a0, s6, a0
-; RV32I-NEXT:    bltu t6, t2, .LBB10_57
-; RV32I-NEXT:  .LBB10_59:
+; RV32I-NEXT:    or t0, a1, s8
+; RV32I-NEXT:    lbu s8, 24(a0)
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu a6, 26(a0)
+; RV32I-NEXT:    lbu a5, 27(a0)
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    lbu a3, 30(a0)
+; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu a0, 28(a0)
+; RV32I-NEXT:    sb a1, 91(sp)
+; RV32I-NEXT:    sb a3, 90(sp)
+; RV32I-NEXT:    sb a4, 89(sp)
+; RV32I-NEXT:    sb a0, 88(sp)
+; RV32I-NEXT:    sb a5, 87(sp)
+; RV32I-NEXT:    sb a6, 86(sp)
+; RV32I-NEXT:    sb a7, 85(sp)
+; RV32I-NEXT:    sb s8, 84(sp)
+; RV32I-NEXT:    sb ra, 83(sp)
+; RV32I-NEXT:    sb s10, 82(sp)
+; RV32I-NEXT:    sb s11, 81(sp)
+; RV32I-NEXT:    sb s9, 80(sp)
+; RV32I-NEXT:    sb s7, 79(sp)
+; RV32I-NEXT:    sb s6, 78(sp)
+; RV32I-NEXT:    sb s5, 77(sp)
+; RV32I-NEXT:    sb s4, 76(sp)
+; RV32I-NEXT:    sb zero, 59(sp)
+; RV32I-NEXT:    sb zero, 58(sp)
+; RV32I-NEXT:    sb zero, 57(sp)
+; RV32I-NEXT:    sb zero, 56(sp)
+; RV32I-NEXT:    sb zero, 55(sp)
+; RV32I-NEXT:    sb zero, 54(sp)
+; RV32I-NEXT:    sb zero, 53(sp)
+; RV32I-NEXT:    sb zero, 52(sp)
+; RV32I-NEXT:    sb zero, 51(sp)
+; RV32I-NEXT:    sb zero, 50(sp)
+; RV32I-NEXT:    sb zero, 49(sp)
+; RV32I-NEXT:    sb zero, 48(sp)
+; RV32I-NEXT:    sb zero, 47(sp)
+; RV32I-NEXT:    sb zero, 46(sp)
+; RV32I-NEXT:    sb zero, 45(sp)
+; RV32I-NEXT:    sb zero, 44(sp)
+; RV32I-NEXT:    sb zero, 43(sp)
+; RV32I-NEXT:    sb zero, 42(sp)
+; RV32I-NEXT:    sb zero, 41(sp)
+; RV32I-NEXT:    sb zero, 40(sp)
+; RV32I-NEXT:    sb zero, 39(sp)
+; RV32I-NEXT:    sb zero, 38(sp)
+; RV32I-NEXT:    sb zero, 37(sp)
+; RV32I-NEXT:    sb zero, 36(sp)
+; RV32I-NEXT:    sb zero, 35(sp)
+; RV32I-NEXT:    sb zero, 34(sp)
+; RV32I-NEXT:    sb zero, 33(sp)
+; RV32I-NEXT:    sb zero, 32(sp)
+; RV32I-NEXT:    sb zero, 31(sp)
+; RV32I-NEXT:    sb zero, 30(sp)
+; RV32I-NEXT:    sb zero, 29(sp)
+; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb s3, 75(sp)
+; RV32I-NEXT:    sb s2, 74(sp)
+; RV32I-NEXT:    sb s1, 73(sp)
+; RV32I-NEXT:    sb s0, 72(sp)
+; RV32I-NEXT:    sb t6, 71(sp)
+; RV32I-NEXT:    sb t5, 70(sp)
+; RV32I-NEXT:    sb t4, 69(sp)
+; RV32I-NEXT:    sb t3, 68(sp)
+; RV32I-NEXT:    sb t2, 67(sp)
+; RV32I-NEXT:    sb t1, 66(sp)
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 65(sp)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 64(sp)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 63(sp)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a0, a0, 0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    and a1, a0, s4
-; RV32I-NEXT:    mv a0, a3
-; RV32I-NEXT:    beqz t6, .LBB10_61
-; RV32I-NEXT:  .LBB10_60:
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:  .LBB10_61:
-; RV32I-NEXT:    li a1, 128
-; RV32I-NEXT:    bltu a5, a1, .LBB10_70
-; RV32I-NEXT:  # %bb.62:
-; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bnez a5, .LBB10_71
-; RV32I-NEXT:  .LBB10_63:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bgez s2, .LBB10_72
-; RV32I-NEXT:  .LBB10_64:
-; RV32I-NEXT:    bgez s11, .LBB10_73
-; RV32I-NEXT:  .LBB10_65:
-; RV32I-NEXT:    bltu a5, t2, .LBB10_74
-; RV32I-NEXT:  .LBB10_66:
-; RV32I-NEXT:    bnez a5, .LBB10_75
-; RV32I-NEXT:  .LBB10_67:
-; RV32I-NEXT:    lw a0, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a0, .LBB10_76
-; RV32I-NEXT:  .LBB10_68:
-; RV32I-NEXT:    sltiu a0, a5, 128
-; RV32I-NEXT:    bltu a5, t2, .LBB10_77
-; RV32I-NEXT:  .LBB10_69:
-; RV32I-NEXT:    and a1, t5, s4
-; RV32I-NEXT:    neg a4, a0
-; RV32I-NEXT:    bnez a5, .LBB10_78
-; RV32I-NEXT:    j .LBB10_79
-; RV32I-NEXT:  .LBB10_70:
+; RV32I-NEXT:    sb a0, 62(sp)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a0, t1, a0
-; RV32I-NEXT:    or a0, t3, a0
-; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    beqz a5, .LBB10_63
-; RV32I-NEXT:  .LBB10_71:
-; RV32I-NEXT:    mv a6, a0
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bltz s2, .LBB10_64
-; RV32I-NEXT:  .LBB10_72:
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    bltz s11, .LBB10_65
-; RV32I-NEXT:  .LBB10_73:
-; RV32I-NEXT:    sw a4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_66
-; RV32I-NEXT:  .LBB10_74:
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a0, a0, ra
-; RV32I-NEXT:    lw a1, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    beqz a5, .LBB10_67
-; RV32I-NEXT:  .LBB10_75:
-; RV32I-NEXT:    sw a0, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a0, .LBB10_68
-; RV32I-NEXT:  .LBB10_76:
+; RV32I-NEXT:    sb a0, 61(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll a0, t4, a0
-; RV32I-NEXT:    or ra, s6, a0
-; RV32I-NEXT:    sltiu a0, a5, 128
-; RV32I-NEXT:    bgeu a5, t2, .LBB10_69
-; RV32I-NEXT:  .LBB10_77:
-; RV32I-NEXT:    and a1, s8, a4
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    neg a4, a0
-; RV32I-NEXT:    beqz a5, .LBB10_79
-; RV32I-NEXT:  .LBB10_78:
-; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:  .LBB10_79:
-; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a1, a4, a1
-; RV32I-NEXT:    and a0, a4, a3
-; RV32I-NEXT:    bltz s11, .LBB10_81
-; RV32I-NEXT:  # %bb.80:
-; RV32I-NEXT:    mv t3, s4
-; RV32I-NEXT:  .LBB10_81:
-; RV32I-NEXT:    and a3, a4, t3
-; RV32I-NEXT:    and a3, a3, s9
-; RV32I-NEXT:    and a5, s8, s4
-; RV32I-NEXT:    and a4, a4, a5
-; RV32I-NEXT:    and a4, a4, s9
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 2(a2)
+; RV32I-NEXT:    sb a0, 60(sp)
+; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    addi a5, sp, 60
+; RV32I-NEXT:    sub a5, a5, a0
+; RV32I-NEXT:    lbu a0, 5(a5)
+; RV32I-NEXT:    lbu a1, 4(a5)
+; RV32I-NEXT:    lbu a3, 6(a5)
+; RV32I-NEXT:    lbu a4, 7(a5)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    slli a4, a4, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    or t4, a4, a0
+; RV32I-NEXT:    andi a1, t0, 7
+; RV32I-NEXT:    lbu a0, 1(a5)
+; RV32I-NEXT:    lbu a3, 0(a5)
+; RV32I-NEXT:    lbu a4, 2(a5)
+; RV32I-NEXT:    lbu a6, 3(a5)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    srli a0, a6, 1
+; RV32I-NEXT:    xori t0, a1, 31
+; RV32I-NEXT:    srl a0, a0, t0
+; RV32I-NEXT:    lbu a3, 13(a5)
+; RV32I-NEXT:    lbu a4, 12(a5)
+; RV32I-NEXT:    lbu a7, 14(a5)
+; RV32I-NEXT:    lbu t1, 15(a5)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    or t1, t1, a3
+; RV32I-NEXT:    lbu a3, 9(a5)
+; RV32I-NEXT:    lbu a4, 8(a5)
+; RV32I-NEXT:    lbu a7, 10(a5)
+; RV32I-NEXT:    lbu t2, 11(a5)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    or t2, t2, a3
+; RV32I-NEXT:    srli a3, t2, 1
+; RV32I-NEXT:    srl a3, a3, t0
+; RV32I-NEXT:    srli a4, t4, 1
+; RV32I-NEXT:    not t3, a1
+; RV32I-NEXT:    srl a7, a4, t3
+; RV32I-NEXT:    lbu a4, 21(a5)
+; RV32I-NEXT:    lbu t5, 20(a5)
+; RV32I-NEXT:    lbu t6, 22(a5)
+; RV32I-NEXT:    lbu s0, 23(a5)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, t5
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or a4, t6, a4
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    lbu t5, 17(a5)
+; RV32I-NEXT:    lbu t6, 16(a5)
+; RV32I-NEXT:    lbu s0, 18(a5)
+; RV32I-NEXT:    lbu s1, 19(a5)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    or t5, t5, t6
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or t5, s0, t5
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    lbu t6, 29(a5)
+; RV32I-NEXT:    lbu s0, 28(a5)
+; RV32I-NEXT:    lbu s1, 30(a5)
+; RV32I-NEXT:    lbu s2, 31(a5)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t6, t6, s0
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    lbu s0, 25(a5)
+; RV32I-NEXT:    or t6, s1, t6
+; RV32I-NEXT:    lbu s1, 24(a5)
+; RV32I-NEXT:    or t6, s2, t6
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    lbu s2, 26(a5)
+; RV32I-NEXT:    or s0, s0, s1
+; RV32I-NEXT:    srli s1, t5, 1
+; RV32I-NEXT:    srl s1, s1, t0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    lbu a5, 27(a5)
+; RV32I-NEXT:    or s0, s2, s0
+; RV32I-NEXT:    srli s2, t1, 1
+; RV32I-NEXT:    srl s2, s2, t3
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a5, a5, s0
+; RV32I-NEXT:    srli s0, a5, 1
+; RV32I-NEXT:    srl t0, s0, t0
+; RV32I-NEXT:    srli s0, a4, 1
+; RV32I-NEXT:    srl t3, s0, t3
+; RV32I-NEXT:    sll t4, t4, a1
+; RV32I-NEXT:    sll t1, t1, a1
+; RV32I-NEXT:    sll t2, t2, a1
+; RV32I-NEXT:    sll a4, a4, a1
+; RV32I-NEXT:    sll t5, t5, a1
+; RV32I-NEXT:    sll t6, t6, a1
+; RV32I-NEXT:    sll a5, a5, a1
+; RV32I-NEXT:    sll a1, a6, a1
+; RV32I-NEXT:    srli a6, a5, 24
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    srli a6, a5, 16
+; RV32I-NEXT:    sb a6, 26(a2)
+; RV32I-NEXT:    or a6, a5, t3
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 25(a2)
+; RV32I-NEXT:    srli a5, t6, 24
+; RV32I-NEXT:    sb a5, 31(a2)
+; RV32I-NEXT:    srli a5, t6, 16
+; RV32I-NEXT:    sb a5, 30(a2)
+; RV32I-NEXT:    or a5, t6, t0
+; RV32I-NEXT:    srli t0, t6, 8
+; RV32I-NEXT:    sb t0, 29(a2)
+; RV32I-NEXT:    srli t0, t5, 24
+; RV32I-NEXT:    sb t0, 19(a2)
+; RV32I-NEXT:    srli t0, t5, 16
+; RV32I-NEXT:    sb t0, 18(a2)
+; RV32I-NEXT:    or t0, t5, s2
+; RV32I-NEXT:    srli t3, t5, 8
+; RV32I-NEXT:    sb t3, 17(a2)
+; RV32I-NEXT:    srli t3, a4, 24
+; RV32I-NEXT:    sb t3, 23(a2)
+; RV32I-NEXT:    srli t3, a4, 16
+; RV32I-NEXT:    sb t3, 22(a2)
+; RV32I-NEXT:    or s1, a4, s1
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 6(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    sb a0, 8(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    or a4, t2, a7
+; RV32I-NEXT:    srli a7, t2, 8
+; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    srli a7, t1, 24
+; RV32I-NEXT:    sb a7, 15(a2)
+; RV32I-NEXT:    srli a7, t1, 16
+; RV32I-NEXT:    sb a7, 14(a2)
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    srli a7, t1, 8
+; RV32I-NEXT:    sb a7, 13(a2)
+; RV32I-NEXT:    srli a7, a1, 24
+; RV32I-NEXT:    sb a7, 3(a2)
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(a2)
-; RV32I-NEXT:    sb a7, 28(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    sb a1, 10(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, t4, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, t4, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    or a0, t4, a0
+; RV32I-NEXT:    srli a1, t4, 8
+; RV32I-NEXT:    sb a1, 5(a2)
 ; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    srli a0, a7, 24
-; RV32I-NEXT:    sb a0, 31(a2)
-; RV32I-NEXT:    srli a0, a7, 16
-; RV32I-NEXT:    sb a0, 30(a2)
-; RV32I-NEXT:    srli a0, a7, 8
-; RV32I-NEXT:    sb a0, 29(a2)
-; RV32I-NEXT:    sb s3, 16(a2)
-; RV32I-NEXT:    srli a0, a6, 24
-; RV32I-NEXT:    sb a0, 27(a2)
-; RV32I-NEXT:    srli a0, a6, 16
-; RV32I-NEXT:    sb a0, 26(a2)
-; RV32I-NEXT:    srli a0, a6, 8
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    srli a0, s3, 24
-; RV32I-NEXT:    sb a0, 19(a2)
-; RV32I-NEXT:    srli a0, s3, 16
-; RV32I-NEXT:    sb a0, 18(a2)
-; RV32I-NEXT:    srli a0, s3, 8
-; RV32I-NEXT:    sb a0, 17(a2)
-; RV32I-NEXT:    sb t0, 20(a2)
-; RV32I-NEXT:    srli a0, t0, 24
-; RV32I-NEXT:    sb a0, 23(a2)
-; RV32I-NEXT:    srli a0, t0, 16
-; RV32I-NEXT:    sb a0, 22(a2)
-; RV32I-NEXT:    srli a0, t0, 8
-; RV32I-NEXT:    sb a0, 21(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb s1, 20(a2)
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 144
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -3114,893 +2766,669 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -32
-; RV64I-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
-; RV64I-NEXT:    lbu a5, 10(a0)
-; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or t0, a7, a4
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    addi sp, sp, -224
+; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 3(a0)
+; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 4(a0)
+; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu t3, 7(a0)
+; RV64I-NEXT:    lbu t4, 8(a0)
+; RV64I-NEXT:    lbu t5, 9(a0)
+; RV64I-NEXT:    lbu t6, 10(a0)
+; RV64I-NEXT:    lbu s0, 11(a0)
+; RV64I-NEXT:    lbu s1, 12(a0)
+; RV64I-NEXT:    lbu s2, 13(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s4, 15(a0)
+; RV64I-NEXT:    lbu s5, 16(a0)
+; RV64I-NEXT:    lbu s6, 17(a0)
+; RV64I-NEXT:    lbu s7, 18(a0)
+; RV64I-NEXT:    lbu s8, 1(a1)
+; RV64I-NEXT:    lbu s9, 0(a1)
+; RV64I-NEXT:    lbu s10, 2(a1)
+; RV64I-NEXT:    lbu s11, 19(a0)
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    or s8, s8, s9
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    lbu s9, 5(a1)
+; RV64I-NEXT:    lbu ra, 4(a1)
+; RV64I-NEXT:    or s8, s10, s8
+; RV64I-NEXT:    lbu s10, 6(a1)
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s9, s9, ra
+; RV64I-NEXT:    lbu ra, 7(a1)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    or s9, s10, s9
+; RV64I-NEXT:    lbu s10, 20(a0)
+; RV64I-NEXT:    slli ra, ra, 24
+; RV64I-NEXT:    or s9, ra, s9
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli s9, s9, 32
+; RV64I-NEXT:    or s8, s9, s8
+; RV64I-NEXT:    lbu s9, 22(a0)
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or t1, s8, a1
+; RV64I-NEXT:    lbu s8, 23(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a5, 26(a0)
+; RV64I-NEXT:    lbu a1, 30(a0)
+; RV64I-NEXT:    lbu a3, 29(a0)
+; RV64I-NEXT:    lbu a4, 28(a0)
+; RV64I-NEXT:    lbu a0, 27(a0)
+; RV64I-NEXT:    sb a1, 86(sp)
+; RV64I-NEXT:    sb a3, 85(sp)
+; RV64I-NEXT:    sb a4, 84(sp)
+; RV64I-NEXT:    sb a0, 83(sp)
+; RV64I-NEXT:    sb a5, 82(sp)
+; RV64I-NEXT:    sb a6, 81(sp)
+; RV64I-NEXT:    sb a7, 80(sp)
+; RV64I-NEXT:    sb s8, 79(sp)
+; RV64I-NEXT:    sb s9, 78(sp)
+; RV64I-NEXT:    sb ra, 77(sp)
+; RV64I-NEXT:    sb s10, 76(sp)
+; RV64I-NEXT:    sb s11, 75(sp)
+; RV64I-NEXT:    sb s7, 74(sp)
+; RV64I-NEXT:    sb s6, 73(sp)
+; RV64I-NEXT:    sb s5, 72(sp)
+; RV64I-NEXT:    sb s4, 71(sp)
+; RV64I-NEXT:    sb s3, 70(sp)
+; RV64I-NEXT:    sb s2, 69(sp)
+; RV64I-NEXT:    sb s1, 68(sp)
+; RV64I-NEXT:    sb s0, 67(sp)
+; RV64I-NEXT:    sb t6, 66(sp)
+; RV64I-NEXT:    sb t5, 65(sp)
+; RV64I-NEXT:    sb t0, 87(sp)
+; RV64I-NEXT:    slli t0, t0, 56
+; RV64I-NEXT:    sb t4, 64(sp)
+; RV64I-NEXT:    sb t3, 63(sp)
+; RV64I-NEXT:    sb t2, 62(sp)
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 61(sp)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 60(sp)
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 59(sp)
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 58(sp)
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 57(sp)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 56(sp)
+; RV64I-NEXT:    srai a0, t0, 63
+; RV64I-NEXT:    sb a0, 112(sp)
+; RV64I-NEXT:    sb a0, 104(sp)
+; RV64I-NEXT:    sb a0, 96(sp)
+; RV64I-NEXT:    sb a0, 88(sp)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 119(sp)
+; RV64I-NEXT:    srli a3, a0, 48
+; RV64I-NEXT:    sb a3, 118(sp)
+; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    sb a4, 117(sp)
+; RV64I-NEXT:    srli a5, a0, 32
+; RV64I-NEXT:    sb a5, 116(sp)
+; RV64I-NEXT:    srli a6, a0, 24
+; RV64I-NEXT:    sb a6, 115(sp)
+; RV64I-NEXT:    srli a7, a0, 16
+; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 113(sp)
+; RV64I-NEXT:    sb a1, 111(sp)
+; RV64I-NEXT:    sb a3, 110(sp)
+; RV64I-NEXT:    sb a4, 109(sp)
+; RV64I-NEXT:    sb a5, 108(sp)
+; RV64I-NEXT:    sb a6, 107(sp)
+; RV64I-NEXT:    sb a7, 106(sp)
+; RV64I-NEXT:    sb a0, 105(sp)
+; RV64I-NEXT:    sb a1, 103(sp)
+; RV64I-NEXT:    sb a3, 102(sp)
+; RV64I-NEXT:    sb a4, 101(sp)
+; RV64I-NEXT:    sb a5, 100(sp)
+; RV64I-NEXT:    sb a6, 99(sp)
+; RV64I-NEXT:    sb a7, 98(sp)
+; RV64I-NEXT:    sb a0, 97(sp)
+; RV64I-NEXT:    sb a1, 95(sp)
+; RV64I-NEXT:    sb a3, 94(sp)
+; RV64I-NEXT:    sb a4, 93(sp)
+; RV64I-NEXT:    sb a5, 92(sp)
+; RV64I-NEXT:    sb a6, 91(sp)
+; RV64I-NEXT:    sb a7, 90(sp)
+; RV64I-NEXT:    sb a0, 89(sp)
+; RV64I-NEXT:    slli a0, t1, 56
+; RV64I-NEXT:    srli a0, a0, 59
+; RV64I-NEXT:    addi a3, sp, 56
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    lbu a0, 9(a3)
+; RV64I-NEXT:    lbu a1, 8(a3)
+; RV64I-NEXT:    lbu a4, 10(a3)
+; RV64I-NEXT:    lbu a5, 11(a3)
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    lbu a1, 13(a3)
+; RV64I-NEXT:    lbu a4, 12(a3)
+; RV64I-NEXT:    lbu a6, 14(a3)
+; RV64I-NEXT:    lbu a7, 15(a3)
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or t1, a7, a4
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    lbu a4, 25(a0)
-; RV64I-NEXT:    lbu a5, 24(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a7, 27(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    or a1, a6, a1
+; RV64I-NEXT:    or a1, a7, a1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    or a4, a0, a5
+; RV64I-NEXT:    andi a1, t1, 7
+; RV64I-NEXT:    lbu a0, 17(a3)
+; RV64I-NEXT:    lbu a5, 16(a3)
+; RV64I-NEXT:    lbu a6, 18(a3)
+; RV64I-NEXT:    lbu a7, 19(a3)
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a5, 29(a0)
-; RV64I-NEXT:    lbu a6, 28(a0)
-; RV64I-NEXT:    lbu t2, 30(a0)
-; RV64I-NEXT:    lbu t3, 31(a0)
+; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    lbu a5, 21(a3)
+; RV64I-NEXT:    lbu a6, 20(a3)
+; RV64I-NEXT:    lbu t0, 22(a3)
+; RV64I-NEXT:    lbu t1, 23(a3)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a5, t0, a5
+; RV64I-NEXT:    or a5, t1, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    or a5, a0, a7
+; RV64I-NEXT:    slli a0, a5, 1
+; RV64I-NEXT:    not a6, a1
+; RV64I-NEXT:    sll a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a3)
+; RV64I-NEXT:    lbu a7, 0(a3)
+; RV64I-NEXT:    lbu t0, 2(a3)
+; RV64I-NEXT:    lbu t1, 3(a3)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    lbu a7, 5(a3)
+; RV64I-NEXT:    lbu t0, 4(a3)
+; RV64I-NEXT:    lbu t2, 6(a3)
+; RV64I-NEXT:    lbu t3, 7(a3)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a5, t2, a5
-; RV64I-NEXT:    or a5, t3, a5
-; RV64I-NEXT:    slli a6, a5, 32
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a7, a4, a7
-; RV64I-NEXT:    lbu a4, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu t2, 18(a0)
-; RV64I-NEXT:    lbu t3, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or a7, t3, a7
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 25(a3)
+; RV64I-NEXT:    lbu t0, 24(a3)
+; RV64I-NEXT:    lbu t2, 26(a3)
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    lbu t0, 29(a3)
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    lbu t1, 28(a3)
+; RV64I-NEXT:    lbu t2, 30(a3)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    lbu t3, 31(a3)
+; RV64I-NEXT:    or t0, t0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a4, t2, a4
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu t2, 20(a0)
-; RV64I-NEXT:    lbu t4, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, t2
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a6, t4, a6
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or t2, a0, t3
-; RV64I-NEXT:    lbu a0, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
-; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu t3, 3(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a0, a6, a0
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu t4, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, t4, a4
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a6, a0, t3
-; RV64I-NEXT:    srl a0, t2, a6
-; RV64I-NEXT:    not t5, a6
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll a1, a1, t5
-; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    addi t3, a6, -192
-; RV64I-NEXT:    sra a4, a7, a6
-; RV64I-NEXT:    mv t6, a1
-; RV64I-NEXT:    bltz t3, .LBB11_2
-; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv t6, a4
-; RV64I-NEXT:  .LBB11_2:
-; RV64I-NEXT:    or a0, t1, t0
-; RV64I-NEXT:    addi t0, a6, -64
-; RV64I-NEXT:    srl t4, a3, a6
-; RV64I-NEXT:    bltz t0, .LBB11_4
-; RV64I-NEXT:  # %bb.3:
-; RV64I-NEXT:    mv s2, t4
-; RV64I-NEXT:    j .LBB11_5
-; RV64I-NEXT:  .LBB11_4:
-; RV64I-NEXT:    srl t1, a0, a6
-; RV64I-NEXT:    slli s0, a3, 1
-; RV64I-NEXT:    sll t5, s0, t5
-; RV64I-NEXT:    or s2, t1, t5
-; RV64I-NEXT:  .LBB11_5:
-; RV64I-NEXT:    negw s0, a6
-; RV64I-NEXT:    sll t5, t2, s0
-; RV64I-NEXT:    li s1, 64
-; RV64I-NEXT:    li t1, 128
-; RV64I-NEXT:    sub s1, s1, a6
-; RV64I-NEXT:    bltu a6, t1, .LBB11_18
-; RV64I-NEXT:  # %bb.6:
-; RV64I-NEXT:    bnez a6, .LBB11_19
-; RV64I-NEXT:  .LBB11_7:
-; RV64I-NEXT:    bgez s1, .LBB11_9
-; RV64I-NEXT:  .LBB11_8:
-; RV64I-NEXT:    sll a7, a7, s0
-; RV64I-NEXT:    srli t2, t2, 1
-; RV64I-NEXT:    subw t5, t1, a6
-; RV64I-NEXT:    not t5, t5
-; RV64I-NEXT:    srl t2, t2, t5
-; RV64I-NEXT:    or t5, a7, t2
-; RV64I-NEXT:  .LBB11_9:
-; RV64I-NEXT:    sraiw a5, a5, 31
-; RV64I-NEXT:    mv a7, a4
-; RV64I-NEXT:    bgez t3, .LBB11_20
-; RV64I-NEXT:  # %bb.10:
-; RV64I-NEXT:    bltu a6, t1, .LBB11_21
-; RV64I-NEXT:  .LBB11_11:
-; RV64I-NEXT:    bnez a6, .LBB11_22
-; RV64I-NEXT:  .LBB11_12:
-; RV64I-NEXT:    bgez t0, .LBB11_23
-; RV64I-NEXT:  .LBB11_13:
-; RV64I-NEXT:    bgeu a6, t1, .LBB11_24
-; RV64I-NEXT:  .LBB11_14:
-; RV64I-NEXT:    bgez t0, .LBB11_25
-; RV64I-NEXT:  .LBB11_15:
-; RV64I-NEXT:    bltu a6, t1, .LBB11_17
-; RV64I-NEXT:  .LBB11_16:
-; RV64I-NEXT:    mv a4, a5
-; RV64I-NEXT:  .LBB11_17:
-; RV64I-NEXT:    sb a4, 24(a2)
-; RV64I-NEXT:    srli a5, a4, 56
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    slli t1, a4, 1
+; RV64I-NEXT:    lbu a3, 27(a3)
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    xori t0, a1, 63
+; RV64I-NEXT:    sll t1, t1, t0
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a7, a3
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a7, a7, t0
+; RV64I-NEXT:    srl a4, a4, a1
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    srli a3, a5, 48
+; RV64I-NEXT:    sb a3, 22(a2)
+; RV64I-NEXT:    srli a3, a5, 40
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    srli a3, a5, 32
+; RV64I-NEXT:    sb a3, 20(a2)
+; RV64I-NEXT:    srli a3, a5, 24
+; RV64I-NEXT:    sb a3, 19(a2)
+; RV64I-NEXT:    srli a3, a5, 16
+; RV64I-NEXT:    sb a3, 18(a2)
+; RV64I-NEXT:    or a3, a5, a7
+; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a5, a1, 56
 ; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a4, 48
+; RV64I-NEXT:    srli a5, a1, 48
 ; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a4, 40
+; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a4, 32
+; RV64I-NEXT:    srli a5, a1, 32
 ; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a4, 24
+; RV64I-NEXT:    srli a5, a1, 24
 ; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a4, 16
+; RV64I-NEXT:    srli a5, a1, 16
 ; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    srli a4, a4, 8
-; RV64I-NEXT:    sb a4, 25(a2)
-; RV64I-NEXT:    sb a1, 16(a2)
-; RV64I-NEXT:    srli a4, a1, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 18(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 17(a2)
-; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a6, 48
 ; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    srli a1, a6, 40
 ; RV64I-NEXT:    sb a1, 5(a2)
-; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    srli a1, a6, 32
 ; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    srli a1, a6, 24
 ; RV64I-NEXT:    sb a1, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    srli a1, a6, 16
 ; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 1(a2)
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a0, a3, 56
+; RV64I-NEXT:    or a1, a6, t1
+; RV64I-NEXT:    sb a6, 0(a2)
+; RV64I-NEXT:    srli a5, a6, 8
+; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    srli a5, a4, 48
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    srli a5, a4, 40
+; RV64I-NEXT:    sb a5, 13(a2)
+; RV64I-NEXT:    srli a5, a4, 32
+; RV64I-NEXT:    sb a5, 12(a2)
+; RV64I-NEXT:    srli a5, a4, 24
+; RV64I-NEXT:    sb a5, 11(a2)
+; RV64I-NEXT:    srli a5, a4, 16
+; RV64I-NEXT:    sb a5, 10(a2)
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    sb a4, 8(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 9(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    srli a1, a1, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    srli a0, a3, 48
-; RV64I-NEXT:    sb a0, 14(a2)
-; RV64I-NEXT:    srli a0, a3, 40
-; RV64I-NEXT:    sb a0, 13(a2)
-; RV64I-NEXT:    srli a0, a3, 32
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    srli a0, a3, 24
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    srli a0, a3, 16
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 224
 ; RV64I-NEXT:    ret
-; RV64I-NEXT:  .LBB11_18:
-; RV64I-NEXT:    slti t6, s1, 0
-; RV64I-NEXT:    neg t6, t6
-; RV64I-NEXT:    and t6, t6, t5
-; RV64I-NEXT:    or t6, s2, t6
-; RV64I-NEXT:    beqz a6, .LBB11_7
-; RV64I-NEXT:  .LBB11_19:
-; RV64I-NEXT:    mv a0, t6
-; RV64I-NEXT:    bltz s1, .LBB11_8
-; RV64I-NEXT:    j .LBB11_9
-; RV64I-NEXT:  .LBB11_20:
-; RV64I-NEXT:    mv a7, a5
-; RV64I-NEXT:    bgeu a6, t1, .LBB11_11
-; RV64I-NEXT:  .LBB11_21:
-; RV64I-NEXT:    slti a7, t0, 0
-; RV64I-NEXT:    neg a7, a7
-; RV64I-NEXT:    and a7, a7, t4
-; RV64I-NEXT:    or a7, a7, t5
-; RV64I-NEXT:    beqz a6, .LBB11_12
-; RV64I-NEXT:  .LBB11_22:
-; RV64I-NEXT:    mv a3, a7
-; RV64I-NEXT:    bltz t0, .LBB11_13
-; RV64I-NEXT:  .LBB11_23:
-; RV64I-NEXT:    mv a1, a4
-; RV64I-NEXT:    bltu a6, t1, .LBB11_14
-; RV64I-NEXT:  .LBB11_24:
-; RV64I-NEXT:    mv a1, a5
-; RV64I-NEXT:    bltz t0, .LBB11_15
-; RV64I-NEXT:  .LBB11_25:
-; RV64I-NEXT:    mv a4, a5
-; RV64I-NEXT:    bgeu a6, t1, .LBB11_16
-; RV64I-NEXT:    j .LBB11_17
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -128
-; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a6, 4(a0)
-; RV32I-NEXT:    lbu t1, 5(a0)
-; RV32I-NEXT:    lbu t5, 6(a0)
-; RV32I-NEXT:    lbu t6, 7(a0)
+; RV32I-NEXT:    addi sp, sp, -144
+; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t3, 31(a0)
 ; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a5, 1(a0)
-; RV32I-NEXT:    lbu t2, 2(a0)
-; RV32I-NEXT:    lbu t3, 3(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s1, 9(a0)
-; RV32I-NEXT:    lbu s7, 10(a0)
-; RV32I-NEXT:    lbu s8, 11(a0)
-; RV32I-NEXT:    lbu a3, 21(a0)
-; RV32I-NEXT:    lbu t0, 20(a0)
-; RV32I-NEXT:    lbu t4, 22(a0)
-; RV32I-NEXT:    lbu s5, 23(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t0
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a3, t4, a3
-; RV32I-NEXT:    or a3, s5, a3
-; RV32I-NEXT:    lbu t0, 17(a0)
-; RV32I-NEXT:    lbu t4, 16(a0)
-; RV32I-NEXT:    lbu s5, 18(a0)
-; RV32I-NEXT:    lbu s6, 19(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t4
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or t0, s5, t0
-; RV32I-NEXT:    or t4, s6, t0
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu s5, 28(a0)
-; RV32I-NEXT:    lbu s6, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, s5
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    or t0, s6, t0
-; RV32I-NEXT:    sw s9, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or s10, s9, t0
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu s5, 24(a0)
-; RV32I-NEXT:    lbu s6, 26(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, s5
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t0, s6, t0
-; RV32I-NEXT:    or s9, a0, t0
-; RV32I-NEXT:    lbu a0, 1(a1)
-; RV32I-NEXT:    lbu t0, 0(a1)
-; RV32I-NEXT:    lbu s5, 2(a1)
+; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    lbu s10, 0(a1)
+; RV32I-NEXT:    lbu s11, 20(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu ra, 2(a1)
+; RV32I-NEXT:    or a4, a4, s10
+; RV32I-NEXT:    lbu s10, 21(a0)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, t0
-; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    or a4, ra, a4
+; RV32I-NEXT:    lbu ra, 22(a0)
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a0, s5, a0
-; RV32I-NEXT:    or a1, a1, a0
-; RV32I-NEXT:    srl a0, s9, a1
-; RV32I-NEXT:    not s5, a1
-; RV32I-NEXT:    slli t0, s10, 1
-; RV32I-NEXT:    sll t0, t0, s5
-; RV32I-NEXT:    or t0, a0, t0
-; RV32I-NEXT:    addi a0, a1, -224
-; RV32I-NEXT:    sw s10, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sra s6, s10, a1
-; RV32I-NEXT:    mv s10, t0
-; RV32I-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a0, .LBB11_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv s10, s6
-; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    slli ra, a4, 8
-; RV32I-NEXT:    slli s3, s3, 16
-; RV32I-NEXT:    slli s4, s4, 24
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    srl a4, t4, a1
-; RV32I-NEXT:    slli a0, a3, 1
-; RV32I-NEXT:    sll s11, a0, s5
-; RV32I-NEXT:    or s11, a4, s11
-; RV32I-NEXT:    addi a7, a1, -160
-; RV32I-NEXT:    srl a4, a3, a1
-; RV32I-NEXT:    sw a4, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a7, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a7, .LBB11_4
-; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    lw s11, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB11_4:
-; RV32I-NEXT:    slli a4, t1, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or s0, ra, s0
-; RV32I-NEXT:    or s3, s4, s3
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    or s8, s8, s7
-; RV32I-NEXT:    neg ra, a1
-; RV32I-NEXT:    sll s7, s9, ra
-; RV32I-NEXT:    li s2, 160
-; RV32I-NEXT:    addi s4, a1, -128
-; RV32I-NEXT:    li t1, 64
-; RV32I-NEXT:    sub a7, s2, a1
-; RV32I-NEXT:    sw s7, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu s4, t1, .LBB11_6
-; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    slti s2, a7, 0
-; RV32I-NEXT:    neg s2, s2
-; RV32I-NEXT:    and s2, s2, s7
-; RV32I-NEXT:    or s10, s11, s2
-; RV32I-NEXT:  .LBB11_6:
-; RV32I-NEXT:    sw a7, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    or t1, a1, a4
+; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a5, 26(a0)
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    lbu a3, 29(a0)
+; RV32I-NEXT:    lbu a4, 28(a0)
+; RV32I-NEXT:    lbu a0, 27(a0)
+; RV32I-NEXT:    sb a1, 58(sp)
+; RV32I-NEXT:    sb a3, 57(sp)
+; RV32I-NEXT:    sb a4, 56(sp)
+; RV32I-NEXT:    sb a0, 55(sp)
+; RV32I-NEXT:    sb a5, 54(sp)
+; RV32I-NEXT:    sb a6, 53(sp)
+; RV32I-NEXT:    sb a7, 52(sp)
+; RV32I-NEXT:    sb t0, 51(sp)
+; RV32I-NEXT:    sb ra, 50(sp)
+; RV32I-NEXT:    sb s10, 49(sp)
+; RV32I-NEXT:    sb s11, 48(sp)
+; RV32I-NEXT:    sb s9, 47(sp)
+; RV32I-NEXT:    sb s8, 46(sp)
+; RV32I-NEXT:    sb s7, 45(sp)
+; RV32I-NEXT:    sb s6, 44(sp)
+; RV32I-NEXT:    sb s5, 43(sp)
+; RV32I-NEXT:    sb t3, 59(sp)
 ; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or s2, a4, a6
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    or a6, s3, s0
-; RV32I-NEXT:    or s8, s8, s1
-; RV32I-NEXT:    mv a4, t4
-; RV32I-NEXT:    beqz s4, .LBB11_8
-; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a4, s10
-; RV32I-NEXT:  .LBB11_8:
-; RV32I-NEXT:    lw a7, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    or t3, t3, t2
-; RV32I-NEXT:    or t1, t5, s2
-; RV32I-NEXT:    srl t2, s8, a1
-; RV32I-NEXT:    slli t5, a6, 1
-; RV32I-NEXT:    sll t5, t5, s5
-; RV32I-NEXT:    or t6, t2, t5
-; RV32I-NEXT:    addi s0, a1, -96
-; RV32I-NEXT:    srl t2, a6, a1
-; RV32I-NEXT:    sw t2, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t2, t6
-; RV32I-NEXT:    li a7, 64
-; RV32I-NEXT:    bltz s0, .LBB11_10
-; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    lw t2, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB11_10:
-; RV32I-NEXT:    or s3, t3, a5
-; RV32I-NEXT:    addi t5, a1, -32
-; RV32I-NEXT:    srl a5, t1, a1
-; RV32I-NEXT:    sw s0, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw a5, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgez t5, .LBB11_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    srl a5, s3, a1
-; RV32I-NEXT:    slli t3, t1, 1
-; RV32I-NEXT:    sll t3, t3, s5
-; RV32I-NEXT:    or a5, a5, t3
-; RV32I-NEXT:  .LBB11_12:
-; RV32I-NEXT:    sll s10, s8, ra
-; RV32I-NEXT:    li t3, 32
-; RV32I-NEXT:    sub s0, t3, a1
-; RV32I-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    slti s0, s0, 0
-; RV32I-NEXT:    neg s0, s0
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_14
-; RV32I-NEXT:  # %bb.13:
-; RV32I-NEXT:    and t2, s0, s10
-; RV32I-NEXT:    or t2, a5, t2
-; RV32I-NEXT:  .LBB11_14:
-; RV32I-NEXT:    sw a6, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, s3
-; RV32I-NEXT:    beqz a1, .LBB11_16
-; RV32I-NEXT:  # %bb.15:
-; RV32I-NEXT:    mv s2, t2
-; RV32I-NEXT:  .LBB11_16:
-; RV32I-NEXT:    sll a6, t4, ra
-; RV32I-NEXT:    li a5, 96
-; RV32I-NEXT:    sub s7, a5, a1
-; RV32I-NEXT:    slti a5, s7, 0
-; RV32I-NEXT:    neg s11, a5
-; RV32I-NEXT:    li t2, 128
-; RV32I-NEXT:    sub s0, t2, a1
-; RV32I-NEXT:    sltiu a5, s0, 64
-; RV32I-NEXT:    neg a5, a5
-; RV32I-NEXT:    bgeu a1, t2, .LBB11_18
-; RV32I-NEXT:  # %bb.17:
-; RV32I-NEXT:    and a4, s11, a6
-; RV32I-NEXT:    and a4, a5, a4
-; RV32I-NEXT:    or a4, s2, a4
-; RV32I-NEXT:  .LBB11_18:
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a5, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    beqz a1, .LBB11_20
-; RV32I-NEXT:  # %bb.19:
-; RV32I-NEXT:    mv s3, a4
-; RV32I-NEXT:  .LBB11_20:
-; RV32I-NEXT:    neg a4, s0
-; RV32I-NEXT:    sub a5, t3, s0
-; RV32I-NEXT:    srl t3, a3, a4
-; RV32I-NEXT:    sw a5, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz a5, .LBB11_23
-; RV32I-NEXT:  # %bb.21:
-; RV32I-NEXT:    mv a0, t3
-; RV32I-NEXT:    bgeu s0, a7, .LBB11_24
-; RV32I-NEXT:  .LBB11_22:
-; RV32I-NEXT:    and a4, s11, s2
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    mv a4, s9
-; RV32I-NEXT:    bnez s0, .LBB11_25
-; RV32I-NEXT:    j .LBB11_26
-; RV32I-NEXT:  .LBB11_23:
-; RV32I-NEXT:    srl a4, t4, a4
-; RV32I-NEXT:    sub a5, a7, s0
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    sll a0, a0, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    bltu s0, a7, .LBB11_22
-; RV32I-NEXT:  .LBB11_24:
-; RV32I-NEXT:    lw a0, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a0, a0, a6
-; RV32I-NEXT:    mv a4, s9
-; RV32I-NEXT:    beqz s0, .LBB11_26
-; RV32I-NEXT:  .LBB11_25:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:  .LBB11_26:
-; RV32I-NEXT:    sw t3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bltz t5, .LBB11_28
-; RV32I-NEXT:  # %bb.27:
-; RV32I-NEXT:    lw t6, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:  .LBB11_28:
-; RV32I-NEXT:    mv t3, t0
-; RV32I-NEXT:    lw a0, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a0, .LBB11_30
-; RV32I-NEXT:  # %bb.29:
-; RV32I-NEXT:    mv t3, s6
-; RV32I-NEXT:  .LBB11_30:
-; RV32I-NEXT:    sltiu a5, a1, 64
+; RV32I-NEXT:    sb s4, 42(sp)
+; RV32I-NEXT:    sb s3, 41(sp)
+; RV32I-NEXT:    sb s2, 40(sp)
+; RV32I-NEXT:    sb s1, 39(sp)
+; RV32I-NEXT:    sb s0, 38(sp)
+; RV32I-NEXT:    sb t6, 37(sp)
+; RV32I-NEXT:    sb t5, 36(sp)
+; RV32I-NEXT:    sb t4, 35(sp)
+; RV32I-NEXT:    sb t2, 34(sp)
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 33(sp)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 32(sp)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 31(sp)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 30(sp)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 29(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    bltu s4, a7, .LBB11_32
-; RV32I-NEXT:  # %bb.31:
-; RV32I-NEXT:    mv t3, a0
-; RV32I-NEXT:  .LBB11_32:
-; RV32I-NEXT:    neg s1, a5
-; RV32I-NEXT:    li a5, 128
-; RV32I-NEXT:    bgeu a1, a5, .LBB11_34
-; RV32I-NEXT:  # %bb.33:
-; RV32I-NEXT:    and a5, s1, t6
-; RV32I-NEXT:    or t3, a5, a4
-; RV32I-NEXT:  .LBB11_34:
-; RV32I-NEXT:    mv a4, s8
-; RV32I-NEXT:    beqz a1, .LBB11_36
-; RV32I-NEXT:  # %bb.35:
-; RV32I-NEXT:    mv a4, t3
-; RV32I-NEXT:  .LBB11_36:
-; RV32I-NEXT:    sw a4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sub a4, a7, a1
-; RV32I-NEXT:    not t3, a4
-; RV32I-NEXT:    lw a4, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a4, .LBB11_38
-; RV32I-NEXT:  # %bb.37:
-; RV32I-NEXT:    lw a4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll a4, a4, ra
-; RV32I-NEXT:    srli a5, s8, 1
-; RV32I-NEXT:    srl a5, a5, t3
-; RV32I-NEXT:    or s10, a4, a5
-; RV32I-NEXT:  .LBB11_38:
-; RV32I-NEXT:    slti a4, t5, 0
-; RV32I-NEXT:    neg s5, a4
-; RV32I-NEXT:    li t2, 64
-; RV32I-NEXT:    bltu a1, a7, .LBB11_40
-; RV32I-NEXT:  # %bb.39:
-; RV32I-NEXT:    lw a4, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a4, a4, 0
-; RV32I-NEXT:    neg a4, a4
-; RV32I-NEXT:    lw a5, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, a4, a5
-; RV32I-NEXT:    j .LBB11_41
-; RV32I-NEXT:  .LBB11_40:
-; RV32I-NEXT:    lw a4, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, s5, a4
-; RV32I-NEXT:    or a4, a4, s10
-; RV32I-NEXT:  .LBB11_41:
-; RV32I-NEXT:    sw t0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv a7, t1
-; RV32I-NEXT:    beqz a1, .LBB11_43
-; RV32I-NEXT:  # %bb.42:
-; RV32I-NEXT:    mv a7, a4
-; RV32I-NEXT:  .LBB11_43:
-; RV32I-NEXT:    mv s10, t3
-; RV32I-NEXT:    sll a4, a3, ra
-; RV32I-NEXT:    srli s8, t4, 1
-; RV32I-NEXT:    not t3, s0
-; RV32I-NEXT:    mv t0, s7
-; RV32I-NEXT:    bltz s7, .LBB11_45
-; RV32I-NEXT:  # %bb.44:
-; RV32I-NEXT:    mv s7, t5
-; RV32I-NEXT:    mv s11, a6
-; RV32I-NEXT:    j .LBB11_46
-; RV32I-NEXT:  .LBB11_45:
-; RV32I-NEXT:    mv s7, t5
-; RV32I-NEXT:    srl a5, s8, t3
-; RV32I-NEXT:    or s11, a4, a5
-; RV32I-NEXT:  .LBB11_46:
-; RV32I-NEXT:    mv t5, t1
-; RV32I-NEXT:    mv t6, s3
-; RV32I-NEXT:    lw a5, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sll s3, a5, ra
-; RV32I-NEXT:    srli s9, s9, 1
-; RV32I-NEXT:    lw a5, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a5, .LBB11_48
-; RV32I-NEXT:  # %bb.47:
-; RV32I-NEXT:    mv t1, s9
-; RV32I-NEXT:    mv s9, s3
-; RV32I-NEXT:    mv s3, s2
-; RV32I-NEXT:    j .LBB11_49
-; RV32I-NEXT:  .LBB11_48:
-; RV32I-NEXT:    li a5, 192
-; RV32I-NEXT:    sub a5, a5, a1
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    mv t1, s9
-; RV32I-NEXT:    srl a5, s9, a5
-; RV32I-NEXT:    mv s9, s3
-; RV32I-NEXT:    or s3, s3, a5
-; RV32I-NEXT:  .LBB11_49:
-; RV32I-NEXT:    mv a5, s6
-; RV32I-NEXT:    lw ra, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz ra, .LBB11_51
-; RV32I-NEXT:  # %bb.50:
-; RV32I-NEXT:    mv a5, a0
-; RV32I-NEXT:  .LBB11_51:
-; RV32I-NEXT:    bltu s4, t2, .LBB11_53
-; RV32I-NEXT:  # %bb.52:
-; RV32I-NEXT:    mv t2, s2
-; RV32I-NEXT:    j .LBB11_54
-; RV32I-NEXT:  .LBB11_53:
-; RV32I-NEXT:    lw a5, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a5, a5, 0
-; RV32I-NEXT:    neg a5, a5
-; RV32I-NEXT:    lw t2, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a5, a5, t2
-; RV32I-NEXT:    lw t2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a5, a5, s3
-; RV32I-NEXT:  .LBB11_54:
-; RV32I-NEXT:    mv s2, s1
-; RV32I-NEXT:    mv ra, s9
-; RV32I-NEXT:    mv s3, a3
-; RV32I-NEXT:    mv s9, t1
-; RV32I-NEXT:    beqz s4, .LBB11_56
-; RV32I-NEXT:  # %bb.55:
-; RV32I-NEXT:    mv s3, a5
-; RV32I-NEXT:  .LBB11_56:
-; RV32I-NEXT:    li a5, 128
-; RV32I-NEXT:    mv t1, t5
-; RV32I-NEXT:    bltu a1, a5, .LBB11_61
-; RV32I-NEXT:  # %bb.57:
-; RV32I-NEXT:    li a7, 64
-; RV32I-NEXT:    bnez a1, .LBB11_62
-; RV32I-NEXT:  .LBB11_58:
-; RV32I-NEXT:    lw a5, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a5, .LBB11_63
-; RV32I-NEXT:  .LBB11_59:
-; RV32I-NEXT:    bltz t0, .LBB11_64
-; RV32I-NEXT:  .LBB11_60:
-; RV32I-NEXT:    mv a4, t2
-; RV32I-NEXT:    j .LBB11_65
-; RV32I-NEXT:  .LBB11_61:
-; RV32I-NEXT:    lw a5, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a5, a5, s11
-; RV32I-NEXT:    or s3, a7, a5
-; RV32I-NEXT:    li a7, 64
-; RV32I-NEXT:    beqz a1, .LBB11_58
-; RV32I-NEXT:  .LBB11_62:
-; RV32I-NEXT:    mv t1, s3
-; RV32I-NEXT:    lw a5, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a5, .LBB11_59
-; RV32I-NEXT:  .LBB11_63:
-; RV32I-NEXT:    srl s1, s8, s10
-; RV32I-NEXT:    or a6, a4, s1
-; RV32I-NEXT:    bgez t0, .LBB11_60
-; RV32I-NEXT:  .LBB11_64:
-; RV32I-NEXT:    srl a4, s9, t3
-; RV32I-NEXT:    or a4, ra, a4
-; RV32I-NEXT:  .LBB11_65:
-; RV32I-NEXT:    lw t3, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw t0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgeu s0, a7, .LBB11_67
-; RV32I-NEXT:  # %bb.66:
-; RV32I-NEXT:    lw a5, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slti a5, a5, 0
-; RV32I-NEXT:    neg a5, a5
-; RV32I-NEXT:    and s1, a5, t3
-; RV32I-NEXT:    or a6, a4, s1
-; RV32I-NEXT:  .LBB11_67:
-; RV32I-NEXT:    beqz s0, .LBB11_69
-; RV32I-NEXT:  # %bb.68:
-; RV32I-NEXT:    sw a6, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:  .LBB11_69:
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    lw a5, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a5, .LBB11_71
-; RV32I-NEXT:  # %bb.70:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:  .LBB11_71:
-; RV32I-NEXT:    li t3, 128
-; RV32I-NEXT:    lw s0, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw a6, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgeu s4, a7, .LBB11_92
-; RV32I-NEXT:  # %bb.72:
-; RV32I-NEXT:    bltu a1, t3, .LBB11_93
-; RV32I-NEXT:  .LBB11_73:
-; RV32I-NEXT:    bnez a1, .LBB11_94
-; RV32I-NEXT:  .LBB11_74:
-; RV32I-NEXT:    mv a4, t0
-; RV32I-NEXT:    bgez s0, .LBB11_95
-; RV32I-NEXT:  .LBB11_75:
-; RV32I-NEXT:    bgez s7, .LBB11_96
-; RV32I-NEXT:  .LBB11_76:
-; RV32I-NEXT:    bltu a1, a7, .LBB11_97
-; RV32I-NEXT:  .LBB11_77:
-; RV32I-NEXT:    bnez a1, .LBB11_98
-; RV32I-NEXT:  .LBB11_78:
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_99
-; RV32I-NEXT:  .LBB11_79:
-; RV32I-NEXT:    lw a4, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz a4, .LBB11_100
-; RV32I-NEXT:  .LBB11_80:
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    bgez s0, .LBB11_101
-; RV32I-NEXT:  .LBB11_81:
-; RV32I-NEXT:    bltu a1, a7, .LBB11_102
-; RV32I-NEXT:  .LBB11_82:
-; RV32I-NEXT:    bnez a1, .LBB11_103
-; RV32I-NEXT:  .LBB11_83:
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_104
-; RV32I-NEXT:  .LBB11_84:
-; RV32I-NEXT:    lw a4, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez s7, .LBB11_105
-; RV32I-NEXT:  .LBB11_85:
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_106
-; RV32I-NEXT:  .LBB11_86:
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_107
-; RV32I-NEXT:  .LBB11_87:
-; RV32I-NEXT:    bgez s7, .LBB11_108
-; RV32I-NEXT:  .LBB11_88:
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_109
-; RV32I-NEXT:  .LBB11_89:
-; RV32I-NEXT:    bltu a1, t3, .LBB11_91
-; RV32I-NEXT:  .LBB11_90:
-; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:  .LBB11_91:
-; RV32I-NEXT:    sb s6, 28(a2)
-; RV32I-NEXT:    srli a0, s6, 24
-; RV32I-NEXT:    sb a0, 31(a2)
-; RV32I-NEXT:    srli a0, s6, 16
-; RV32I-NEXT:    sb a0, 30(a2)
-; RV32I-NEXT:    srli a0, s6, 8
-; RV32I-NEXT:    sb a0, 29(a2)
-; RV32I-NEXT:    sb t0, 24(a2)
-; RV32I-NEXT:    srli a0, t0, 24
-; RV32I-NEXT:    sb a0, 27(a2)
-; RV32I-NEXT:    srli a0, t0, 16
-; RV32I-NEXT:    sb a0, 26(a2)
-; RV32I-NEXT:    srli a0, t0, 8
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli a0, t4, 24
-; RV32I-NEXT:    sb a0, 19(a2)
-; RV32I-NEXT:    srli a0, t4, 16
-; RV32I-NEXT:    sb a0, 18(a2)
-; RV32I-NEXT:    srli a0, t4, 8
-; RV32I-NEXT:    sb a0, 17(a2)
-; RV32I-NEXT:    sb a3, 20(a2)
-; RV32I-NEXT:    srli a0, a3, 24
-; RV32I-NEXT:    sb a0, 23(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    sb a0, 22(a2)
+; RV32I-NEXT:    sb a0, 28(sp)
+; RV32I-NEXT:    srai a0, t3, 31
+; RV32I-NEXT:    sb a0, 88(sp)
+; RV32I-NEXT:    sb a0, 84(sp)
+; RV32I-NEXT:    sb a0, 80(sp)
+; RV32I-NEXT:    sb a0, 76(sp)
+; RV32I-NEXT:    sb a0, 72(sp)
+; RV32I-NEXT:    sb a0, 68(sp)
+; RV32I-NEXT:    sb a0, 64(sp)
+; RV32I-NEXT:    sb a0, 60(sp)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 91(sp)
+; RV32I-NEXT:    srli a3, a0, 16
+; RV32I-NEXT:    sb a3, 90(sp)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 89(sp)
+; RV32I-NEXT:    sb a1, 87(sp)
+; RV32I-NEXT:    sb a3, 86(sp)
+; RV32I-NEXT:    sb a0, 85(sp)
+; RV32I-NEXT:    sb a1, 83(sp)
+; RV32I-NEXT:    sb a3, 82(sp)
+; RV32I-NEXT:    sb a0, 81(sp)
+; RV32I-NEXT:    sb a1, 79(sp)
+; RV32I-NEXT:    sb a3, 78(sp)
+; RV32I-NEXT:    sb a0, 77(sp)
+; RV32I-NEXT:    sb a1, 75(sp)
+; RV32I-NEXT:    sb a3, 74(sp)
+; RV32I-NEXT:    sb a0, 73(sp)
+; RV32I-NEXT:    sb a1, 71(sp)
+; RV32I-NEXT:    sb a3, 70(sp)
+; RV32I-NEXT:    sb a0, 69(sp)
+; RV32I-NEXT:    sb a1, 67(sp)
+; RV32I-NEXT:    sb a3, 66(sp)
+; RV32I-NEXT:    sb a0, 65(sp)
+; RV32I-NEXT:    sb a1, 63(sp)
+; RV32I-NEXT:    sb a3, 62(sp)
+; RV32I-NEXT:    sb a0, 61(sp)
+; RV32I-NEXT:    slli a0, t1, 24
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    addi a3, sp, 28
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    lbu a0, 5(a3)
+; RV32I-NEXT:    lbu a1, 4(a3)
+; RV32I-NEXT:    lbu a4, 6(a3)
+; RV32I-NEXT:    lbu a5, 7(a3)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    or t4, a5, a0
+; RV32I-NEXT:    andi a4, t1, 7
+; RV32I-NEXT:    lbu a0, 9(a3)
+; RV32I-NEXT:    lbu a1, 8(a3)
+; RV32I-NEXT:    lbu a5, 10(a3)
+; RV32I-NEXT:    lbu a6, 11(a3)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a0, a5, a0
+; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    slli a0, a6, 1
+; RV32I-NEXT:    not t0, a4
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    lbu a1, 1(a3)
+; RV32I-NEXT:    lbu a5, 0(a3)
+; RV32I-NEXT:    lbu a7, 2(a3)
+; RV32I-NEXT:    lbu t1, 3(a3)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or t1, t1, a1
+; RV32I-NEXT:    slli a1, t4, 1
+; RV32I-NEXT:    xori t2, a4, 31
+; RV32I-NEXT:    sll a1, a1, t2
+; RV32I-NEXT:    lbu a5, 13(a3)
+; RV32I-NEXT:    lbu a7, 12(a3)
+; RV32I-NEXT:    lbu t3, 14(a3)
+; RV32I-NEXT:    lbu t5, 15(a3)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or t3, t5, a5
+; RV32I-NEXT:    lbu a5, 17(a3)
+; RV32I-NEXT:    lbu a7, 16(a3)
+; RV32I-NEXT:    lbu t5, 18(a3)
+; RV32I-NEXT:    lbu t6, 19(a3)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t5, a5
+; RV32I-NEXT:    or a5, t6, a5
+; RV32I-NEXT:    slli a7, a5, 1
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    lbu t5, 21(a3)
+; RV32I-NEXT:    lbu t6, 20(a3)
+; RV32I-NEXT:    lbu s0, 22(a3)
+; RV32I-NEXT:    lbu s1, 23(a3)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    or t5, t5, t6
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or t5, s0, t5
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    lbu t6, 25(a3)
+; RV32I-NEXT:    lbu s0, 24(a3)
+; RV32I-NEXT:    lbu s1, 26(a3)
+; RV32I-NEXT:    lbu s2, 27(a3)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t6, t6, s0
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    or t6, s1, t6
+; RV32I-NEXT:    or t6, s2, t6
+; RV32I-NEXT:    lbu s0, 29(a3)
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    lbu s2, 28(a3)
+; RV32I-NEXT:    sll t0, s1, t0
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    lbu s1, 30(a3)
+; RV32I-NEXT:    or s0, s0, s2
+; RV32I-NEXT:    slli s2, t3, 1
+; RV32I-NEXT:    sll s2, s2, t2
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu a3, 31(a3)
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s1, t5, 1
+; RV32I-NEXT:    sll s1, s1, t2
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    slli s0, a3, 1
+; RV32I-NEXT:    sll t2, s0, t2
+; RV32I-NEXT:    srl t4, t4, a4
+; RV32I-NEXT:    srl t1, t1, a4
+; RV32I-NEXT:    srl t3, t3, a4
+; RV32I-NEXT:    srl a6, a6, a4
+; RV32I-NEXT:    srl t5, t5, a4
+; RV32I-NEXT:    srl a5, a5, a4
+; RV32I-NEXT:    srl t6, t6, a4
+; RV32I-NEXT:    sra a3, a3, a4
+; RV32I-NEXT:    srli a4, t6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    or a4, t6, t2
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    srli t2, t6, 8
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a3, 24
+; RV32I-NEXT:    sb t2, 31(a2)
+; RV32I-NEXT:    srli t2, a3, 16
+; RV32I-NEXT:    sb t2, 30(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 21(a2)
-; RV32I-NEXT:    sb t6, 0(a2)
-; RV32I-NEXT:    sb a6, 12(a2)
-; RV32I-NEXT:    srli a0, t6, 24
-; RV32I-NEXT:    sb a0, 3(a2)
-; RV32I-NEXT:    srli a0, t6, 16
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    srli a0, t6, 8
-; RV32I-NEXT:    sb a0, 1(a2)
-; RV32I-NEXT:    sb t1, 4(a2)
-; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    srli a0, a6, 24
-; RV32I-NEXT:    sb a0, 15(a2)
-; RV32I-NEXT:    srli a0, a6, 16
-; RV32I-NEXT:    sb a0, 14(a2)
-; RV32I-NEXT:    srli a0, a6, 8
-; RV32I-NEXT:    sb a0, 13(a2)
-; RV32I-NEXT:    srli a0, t1, 24
+; RV32I-NEXT:    sb a3, 29(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 18(a2)
+; RV32I-NEXT:    or s1, a5, s1
+; RV32I-NEXT:    sb a5, 16(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 17(a2)
+; RV32I-NEXT:    srli a3, t5, 16
+; RV32I-NEXT:    sb a3, 22(a2)
+; RV32I-NEXT:    or a3, t5, t0
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    srli a5, t5, 8
+; RV32I-NEXT:    sb a5, 21(a2)
+; RV32I-NEXT:    srli a5, a6, 16
+; RV32I-NEXT:    sb a5, 10(a2)
+; RV32I-NEXT:    or a5, a6, s2
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    srli a6, t3, 16
+; RV32I-NEXT:    sb a6, 14(a2)
+; RV32I-NEXT:    or a6, t3, a7
+; RV32I-NEXT:    sb t3, 12(a2)
+; RV32I-NEXT:    srli a7, t3, 8
+; RV32I-NEXT:    sb a7, 13(a2)
+; RV32I-NEXT:    srli a7, t1, 16
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    srli a7, t1, 8
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    srli a7, t4, 16
+; RV32I-NEXT:    sb a7, 6(a2)
+; RV32I-NEXT:    or a0, t4, a0
+; RV32I-NEXT:    sb t4, 4(a2)
+; RV32I-NEXT:    srli a7, t4, 8
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli s1, s1, 24
+; RV32I-NEXT:    sb s1, 19(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 23(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a3, a6, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a1, a1, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    srli a0, t1, 16
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a0, t1, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    srli a0, a4, 8
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 144
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB11_92:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_73
-; RV32I-NEXT:  .LBB11_93:
-; RV32I-NEXT:    lw a4, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, s5, a4
-; RV32I-NEXT:    and a4, s2, a4
-; RV32I-NEXT:    lw a5, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    beqz a1, .LBB11_74
-; RV32I-NEXT:  .LBB11_94:
-; RV32I-NEXT:    mv a6, a4
-; RV32I-NEXT:    mv a4, t0
-; RV32I-NEXT:    bltz s0, .LBB11_75
-; RV32I-NEXT:  .LBB11_95:
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    bltz s7, .LBB11_76
-; RV32I-NEXT:  .LBB11_96:
-; RV32I-NEXT:    lw a5, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_77
-; RV32I-NEXT:  .LBB11_97:
-; RV32I-NEXT:    lw a4, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, a4, t2
-; RV32I-NEXT:    lw a5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    beqz a1, .LBB11_78
-; RV32I-NEXT:  .LBB11_98:
-; RV32I-NEXT:    mv t4, a4
-; RV32I-NEXT:    bltu a1, t3, .LBB11_79
-; RV32I-NEXT:  .LBB11_99:
-; RV32I-NEXT:    mv t4, a0
-; RV32I-NEXT:    lw a4, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bgez a4, .LBB11_80
-; RV32I-NEXT:  .LBB11_100:
-; RV32I-NEXT:    srl a4, s9, s10
-; RV32I-NEXT:    or t2, ra, a4
-; RV32I-NEXT:    mv a4, s6
-; RV32I-NEXT:    bltz s0, .LBB11_81
-; RV32I-NEXT:  .LBB11_101:
-; RV32I-NEXT:    mv a4, a0
-; RV32I-NEXT:    bgeu a1, a7, .LBB11_82
-; RV32I-NEXT:  .LBB11_102:
-; RV32I-NEXT:    lw a4, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    and a4, s5, a4
-; RV32I-NEXT:    or a4, a4, t2
-; RV32I-NEXT:    beqz a1, .LBB11_83
-; RV32I-NEXT:  .LBB11_103:
-; RV32I-NEXT:    mv a3, a4
-; RV32I-NEXT:    bltu a1, t3, .LBB11_84
-; RV32I-NEXT:  .LBB11_104:
-; RV32I-NEXT:    mv a3, a0
-; RV32I-NEXT:    lw a4, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    bltz s7, .LBB11_85
-; RV32I-NEXT:  .LBB11_105:
-; RV32I-NEXT:    mv t0, s6
-; RV32I-NEXT:    bltu a1, a7, .LBB11_86
-; RV32I-NEXT:  .LBB11_106:
-; RV32I-NEXT:    mv t0, a0
-; RV32I-NEXT:    bltu a1, t3, .LBB11_87
-; RV32I-NEXT:  .LBB11_107:
-; RV32I-NEXT:    mv t0, a0
-; RV32I-NEXT:    bltz s7, .LBB11_88
-; RV32I-NEXT:  .LBB11_108:
-; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:    bltu a1, a7, .LBB11_89
-; RV32I-NEXT:  .LBB11_109:
-; RV32I-NEXT:    mv s6, a0
-; RV32I-NEXT:    bgeu a1, t3, .LBB11_90
-; RV32I-NEXT:    j .LBB11_91
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
   %res = ashr i256 %src, %bitOff

diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index d1045108462b2..dccabfd7f7f32 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -177,300 +177,236 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $92, %esp
+; X86-NEXT:    subl $152, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %ebp, %edi
 ; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    subl %ecx, %ebp
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl %ecx, %esi
 ; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    sbbl %ecx, %ebp
+; X86-NEXT:    sbbl %ecx, %edi
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    orl %ebp, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    bsrl %edi, %eax
 ; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    bsrl %esi, %edx
+; X86-NEXT:    bsrl %ebp, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %eax, %edx
-; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    bsrl %ebx, %eax
 ; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bsrl %ebp, %ebp
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:    addl $32, %ebp
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnel %eax, %ebp
-; X86-NEXT:    addl $64, %ebp
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    addl $64, %ecx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ebp
+; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    bsrl %ebp, %edi
 ; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    bsrl %ecx, %esi
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    addl $32, %esi
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    cmovnel %edi, %esi
 ; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    cmovnel %edx, %esi
 ; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl %esi, %ebp
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    subl %esi, %ecx
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $127, %edx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebp, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    sbbl %ebp, %edx
 ; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    setb %dl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
+; X86-NEXT:    cmovnel %edi, %ebx
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    cmovnel %edi, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    cmovel %ecx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    jne .LBB4_8
 ; X86-NEXT:  # %bb.1: # %_udiv-special-cases
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    xorl $127, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    je .LBB4_8
 ; X86-NEXT:  # %bb.2: # %udiv-bb1
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    cmovnel %edx, %edi
-; X86-NEXT:    subb $64, %ch
-; X86-NEXT:    cmovael %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    negb %ch
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    xorb $127, %al
+; X86-NEXT:    movb %al, %ch
+; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $15, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %edi
+; X86-NEXT:    movl 144(%esp,%edi), %edx
+; X86-NEXT:    movl 148(%esp,%edi), %ebx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %edx, %ebx
-; X86-NEXT:    orl %ebp, %ebx
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X86-NEXT:    addb $-64, %cl
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movl 140(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 136(%esp,%edi), %edx
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    cmovnel %edi, %ebp
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X86-NEXT:    cmpb $64, %cl
-; X86-NEXT:    cmovbl %ebx, %ebp
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    cmovael %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl %cl, %ebx, %eax
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NEXT:    shldl %cl, %esi, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    testb $32, (%esp) # 1-byte Folded Reload
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    cmpb $64, %dl
-; X86-NEXT:    cmovael %ebx, %ecx
-; X86-NEXT:    testb %dl, %dl
-; X86-NEXT:    cmovel %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    cmovel %esi, %ebp
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_3: # %udiv-preheader
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    cmovnel %eax, %ebp
-; X86-NEXT:    cmovnel %eax, %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:    subb $64, %ch
-; X86-NEXT:    cmovael %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    negb %ch
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %eax, %esi
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    shrb $3, %cl
+; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    movl 100(%esp,%edx), %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %eax, %esi
-; X86-NEXT:    orl %ebp, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movb %al, %cl
-; X86-NEXT:    addb $-64, %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    shrl %cl, %ebp
+; X86-NEXT:    movl 96(%esp,%edx), %ebp
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl 88(%esp,%eax), %ebx
+; X86-NEXT:    movl 92(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    notb %cl
+; X86-NEXT:    addl %ebp, %ebp
+; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    orl %eax, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    cmovnel %edi, %ebp
-; X86-NEXT:    cmpb $64, %al
-; X86-NEXT:    cmovbl %esi, %ebp
-; X86-NEXT:    cmovael %edi, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    cmovnel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    testb $32, %al
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %edx, %edi
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    cmpb $64, %al
-; X86-NEXT:    cmovael %edi, %esi
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -480,129 +416,127 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ebp
+; X86-NEXT:    shldl $1, %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    shldl $1, %edx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ebx
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ebx, %ebx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl %edi, %esi
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    sbbl %esi, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5:
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %esi, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    orl %ecx, %ebx
 ; X86-NEXT:    shldl $1, %eax, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:  .LBB4_8: # %udiv-end
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %ebp
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    xorl %ecx, %ebx
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl %edx, (%edi)
-; X86-NEXT:    movl %eax, 4(%edi)
-; X86-NEXT:    movl %esi, 8(%edi)
-; X86-NEXT:    movl %ebp, 12(%edi)
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %esi, 8(%ecx)
+; X86-NEXT:    movl %ebx, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
@@ -614,49 +548,48 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %ebp
+; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %ecx, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    imull %edx, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    mull %edx
+; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    subl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    addl $92, %esp
+; X86-NEXT:    addl $152, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 7280b916c7f51..9212295254e99 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -177,18 +177,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $72, %esp
+; X86-NEXT:    subl $132, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    orl %ebp, %eax
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    sete (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -197,25 +197,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    bsrl %ebp, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
 ; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    bsrl %edi, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %esi, %ecx
 ; X86-NEXT:    addl $64, %ecx
-; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    orl %ebp, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bsrl %edx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
 ; X86-NEXT:    testl %eax, %eax
@@ -230,369 +231,312 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    cmovnel %edi, %esi
 ; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    movl $127, %edx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %ebp, %edx
 ; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    setb %dl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
+; X86-NEXT:    orb (%esp), %dl # 1-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovnel %ebp, %ebx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    cmovnel %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovnel %ebp, %esi
-; X86-NEXT:    cmovel {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    jne .LBB4_8
-; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    cmovnel %edi, %eax
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    cmovnel %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmovnel %edi, %ebx
+; X86-NEXT:    cmovel {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.8: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    je .LBB4_8
-; X86-NEXT:  # %bb.2: # %udiv-bb1
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorb $127, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl %cl, %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    je .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-bb1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    xorb $127, %al
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %al, %ch
+; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $15, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 124(%esp,%eax), %edx
+; X86-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    subb $64, %ch
-; X86-NEXT:    cmovael %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    negb %ch
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movl 120(%esp,%eax), %ebp
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl 116(%esp,%eax), %edi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb %al, %cl
-; X86-NEXT:    addb $-64, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %ebp
 ; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovnel %eax, %ebx
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X86-NEXT:    cmpb $64, %cl
-; X86-NEXT:    cmovbl %esi, %ebx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    cmovael %esi, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shldl %cl, %esi, %ebp
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    orl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpb $64, %ch
-; X86-NEXT:    cmovael %eax, %ebp
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    testb %ch, %ch
-; X86-NEXT:    cmovel {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    cmovel %esi, %ebx
-; X86-NEXT:    addl $1, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    jae .LBB4_3
+; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.6:
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_3: # %udiv-preheader
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shrdl %cl, %esi, %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    jmp .LBB4_9
+; X86-NEXT:  .LBB4_2: # %udiv-preheader
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmovnel %esi, %ebp
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovnel %eax, %edx
-; X86-NEXT:    cmovnel %eax, %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:    subb $64, %ch
-; X86-NEXT:    cmovael %eax, %esi
-; X86-NEXT:    negb %ch
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %al, %ch
+; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 80(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 76(%esp,%eax), %edi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NEXT:    movl 68(%esp,%eax), %ebp
+; X86-NEXT:    movl 72(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    notb %cl
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %edx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl %cl, %eax, %ebx
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %eax, %ebx
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    addb $-64, %al
-; X86-NEXT:    movb %al, %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    cmpb $64, %dl
-; X86-NEXT:    cmovbl %ebx, %eax
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    cmovael %ebx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NEXT:    testb $32, (%esp) # 1-byte Folded Reload
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    shrdl %cl, %ebp, %edx
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    cmpb $64, %cl
-; X86-NEXT:    cmovael %edx, %ebx
-; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    cmovel {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmovel {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB4_4: # %udiv-do-while
+; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    shldl $1, %ebx, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    shldl $1, %ebx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %esi
-; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %ebp, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %ebp, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %ebp, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ebp
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    jne .LBB4_4
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    jne .LBB4_3
+; X86-NEXT:  # %bb.4:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ebx, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    shldl $1, %esi, %ebx
-; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    shldl $1, %ebp, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    addl %ebp, %ebp
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    shldl $1, %edi, %ebp
 ; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:  .LBB4_8: # %udiv-end
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:  .LBB4_9: # %udiv-end
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, (%ecx)
+; X86-NEXT:    movl %ebp, 4(%ecx)
+; X86-NEXT:    movl %esi, 8(%ecx)
+; X86-NEXT:    movl %eax, 12(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %ebp, %edx
-; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %edi, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull %edi, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    subl (%esp), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    sbbl %ebp, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -602,7 +546,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    addl $72, %esp
+; X86-NEXT:    addl $132, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 03ea229dc4dda..0b3ef70d2beef 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -12,228 +12,196 @@
 define i256 @test1(i256 %a) nounwind {
 ; ILP-LABEL: test1:
 ; ILP:       # %bb.0:
-; ILP-NEXT:    pushq %r14
-; ILP-NEXT:    pushq %rbx
 ; ILP-NEXT:    movq %rdi, %rax
-; ILP-NEXT:    xorl %r8d, %r8d
-; ILP-NEXT:    addl %esi, %esi
-; ILP-NEXT:    leal 3(%rsi), %edx
-; ILP-NEXT:    movl $1, %r9d
-; ILP-NEXT:    xorl %r10d, %r10d
-; ILP-NEXT:    movl %edx, %ecx
-; ILP-NEXT:    shldq %cl, %r9, %r10
-; ILP-NEXT:    movl $1, %r11d
-; ILP-NEXT:    shlq %cl, %r11
-; ILP-NEXT:    leal -125(%rsi), %edi
-; ILP-NEXT:    xorl %ebx, %ebx
-; ILP-NEXT:    movl %edi, %ecx
-; ILP-NEXT:    shldq %cl, %r9, %rbx
-; ILP-NEXT:    testb $64, %dl
-; ILP-NEXT:    cmovneq %r11, %r10
-; ILP-NEXT:    cmovneq %r8, %r11
-; ILP-NEXT:    movl $1, %r14d
-; ILP-NEXT:    shlq %cl, %r14
-; ILP-NEXT:    movb $125, %cl
-; ILP-NEXT:    subb %sil, %cl
-; ILP-NEXT:    shrdq %cl, %r8, %r9
-; ILP-NEXT:    testb $64, %cl
-; ILP-NEXT:    cmovneq %r8, %r9
-; ILP-NEXT:    testb $64, %dil
-; ILP-NEXT:    cmovneq %r14, %rbx
-; ILP-NEXT:    cmovneq %r8, %r14
-; ILP-NEXT:    testb %dl, %dl
-; ILP-NEXT:    cmovsq %r8, %r10
-; ILP-NEXT:    cmovsq %r8, %r11
+; ILP-NEXT:    leal (%rsi,%rsi), %ecx
+; ILP-NEXT:    addb $3, %cl
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movl %ecx, %edx
+; ILP-NEXT:    shrb $3, %dl
+; ILP-NEXT:    andb $7, %cl
+; ILP-NEXT:    negb %dl
+; ILP-NEXT:    movsbq %dl, %rdx
+; ILP-NEXT:    movq -16(%rsp,%rdx), %rsi
+; ILP-NEXT:    movq -8(%rsp,%rdx), %rdi
+; ILP-NEXT:    shldq %cl, %rsi, %rdi
+; ILP-NEXT:    movq -32(%rsp,%rdx), %r8
+; ILP-NEXT:    movq -24(%rsp,%rdx), %rdx
+; ILP-NEXT:    movq %r8, %r9
+; ILP-NEXT:    shlq %cl, %r9
+; ILP-NEXT:    movq %rdx, %r10
+; ILP-NEXT:    shldq %cl, %r8, %r10
+; ILP-NEXT:    movq %rdi, 24(%rax)
 ; ILP-NEXT:    movq %r10, 8(%rax)
-; ILP-NEXT:    movq %r11, (%rax)
-; ILP-NEXT:    cmovnsq %r8, %rbx
-; ILP-NEXT:    cmoveq %r8, %rbx
-; ILP-NEXT:    movq %rbx, 24(%rax)
-; ILP-NEXT:    cmovnsq %r9, %r14
-; ILP-NEXT:    cmoveq %r8, %r14
-; ILP-NEXT:    movq %r14, 16(%rax)
-; ILP-NEXT:    popq %rbx
-; ILP-NEXT:    popq %r14
+; ILP-NEXT:    movq %r9, (%rax)
+; ILP-NEXT:    shlq %cl, %rsi
+; ILP-NEXT:    notb %cl
+; ILP-NEXT:    shrq %rdx
+; ILP-NEXT:    # kill: def $cl killed $cl killed $ecx
+; ILP-NEXT:    shrq %cl, %rdx
+; ILP-NEXT:    orq %rsi, %rdx
+; ILP-NEXT:    movq %rdx, 16(%rax)
 ; ILP-NEXT:    retq
 ;
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
-; HYBRID-NEXT:    pushq %rbx
 ; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    addl %esi, %esi
-; HYBRID-NEXT:    movb $125, %cl
-; HYBRID-NEXT:    subb %sil, %cl
-; HYBRID-NEXT:    xorl %edi, %edi
-; HYBRID-NEXT:    movl $1, %r9d
-; HYBRID-NEXT:    movl $1, %r8d
-; HYBRID-NEXT:    shrdq %cl, %rdi, %r8
-; HYBRID-NEXT:    testb $64, %cl
-; HYBRID-NEXT:    cmovneq %rdi, %r8
-; HYBRID-NEXT:    leal 3(%rsi), %edx
-; HYBRID-NEXT:    xorl %r11d, %r11d
-; HYBRID-NEXT:    movl %edx, %ecx
-; HYBRID-NEXT:    shldq %cl, %r9, %r11
-; HYBRID-NEXT:    addb $-125, %sil
-; HYBRID-NEXT:    xorl %ebx, %ebx
+; HYBRID-NEXT:    addb $3, %sil
 ; HYBRID-NEXT:    movl %esi, %ecx
-; HYBRID-NEXT:    shldq %cl, %r9, %rbx
-; HYBRID-NEXT:    movl $1, %r10d
-; HYBRID-NEXT:    shlq %cl, %r10
-; HYBRID-NEXT:    testb $64, %sil
-; HYBRID-NEXT:    cmovneq %r10, %rbx
-; HYBRID-NEXT:    cmovneq %rdi, %r10
-; HYBRID-NEXT:    movl %edx, %ecx
-; HYBRID-NEXT:    shlq %cl, %r9
-; HYBRID-NEXT:    testb $64, %dl
-; HYBRID-NEXT:    cmovneq %r9, %r11
-; HYBRID-NEXT:    cmovneq %rdi, %r9
-; HYBRID-NEXT:    testb %dl, %dl
-; HYBRID-NEXT:    cmovsq %rdi, %r11
-; HYBRID-NEXT:    movq %r11, 8(%rax)
-; HYBRID-NEXT:    cmovsq %rdi, %r9
-; HYBRID-NEXT:    movq %r9, (%rax)
-; HYBRID-NEXT:    cmovnsq %rdi, %rbx
-; HYBRID-NEXT:    cmoveq %rdi, %rbx
-; HYBRID-NEXT:    movq %rbx, 24(%rax)
-; HYBRID-NEXT:    cmovnsq %r8, %r10
-; HYBRID-NEXT:    cmoveq %rdi, %r10
-; HYBRID-NEXT:    movq %r10, 16(%rax)
-; HYBRID-NEXT:    popq %rbx
+; HYBRID-NEXT:    andb $7, %cl
+; HYBRID-NEXT:    shrb $3, %sil
+; HYBRID-NEXT:    negb %sil
+; HYBRID-NEXT:    movsbq %sil, %rdx
+; HYBRID-NEXT:    movq -16(%rsp,%rdx), %rsi
+; HYBRID-NEXT:    movq -8(%rsp,%rdx), %rdi
+; HYBRID-NEXT:    shldq %cl, %rsi, %rdi
+; HYBRID-NEXT:    movq %rdi, 24(%rax)
+; HYBRID-NEXT:    movq -32(%rsp,%rdx), %rdi
+; HYBRID-NEXT:    movq -24(%rsp,%rdx), %rdx
+; HYBRID-NEXT:    movq %rdx, %r8
+; HYBRID-NEXT:    shldq %cl, %rdi, %r8
+; HYBRID-NEXT:    movq %r8, 8(%rax)
+; HYBRID-NEXT:    shlq %cl, %rdi
+; HYBRID-NEXT:    movq %rdi, (%rax)
+; HYBRID-NEXT:    shlq %cl, %rsi
+; HYBRID-NEXT:    notb %cl
+; HYBRID-NEXT:    shrq %rdx
+; HYBRID-NEXT:    shrq %cl, %rdx
+; HYBRID-NEXT:    orq %rsi, %rdx
+; HYBRID-NEXT:    movq %rdx, 16(%rax)
 ; HYBRID-NEXT:    retq
 ;
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
-; BURR-NEXT:    pushq %rbx
 ; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    addl %esi, %esi
-; BURR-NEXT:    movb $125, %cl
-; BURR-NEXT:    subb %sil, %cl
-; BURR-NEXT:    xorl %edi, %edi
-; BURR-NEXT:    movl $1, %r9d
-; BURR-NEXT:    movl $1, %r8d
-; BURR-NEXT:    shrdq %cl, %rdi, %r8
-; BURR-NEXT:    testb $64, %cl
-; BURR-NEXT:    cmovneq %rdi, %r8
-; BURR-NEXT:    leal 3(%rsi), %edx
-; BURR-NEXT:    xorl %r11d, %r11d
-; BURR-NEXT:    movl %edx, %ecx
-; BURR-NEXT:    shldq %cl, %r9, %r11
-; BURR-NEXT:    addb $-125, %sil
-; BURR-NEXT:    xorl %ebx, %ebx
+; BURR-NEXT:    addb $3, %sil
 ; BURR-NEXT:    movl %esi, %ecx
-; BURR-NEXT:    shldq %cl, %r9, %rbx
-; BURR-NEXT:    movl $1, %r10d
-; BURR-NEXT:    shlq %cl, %r10
-; BURR-NEXT:    testb $64, %sil
-; BURR-NEXT:    cmovneq %r10, %rbx
-; BURR-NEXT:    cmovneq %rdi, %r10
-; BURR-NEXT:    movl %edx, %ecx
-; BURR-NEXT:    shlq %cl, %r9
-; BURR-NEXT:    testb $64, %dl
-; BURR-NEXT:    cmovneq %r9, %r11
-; BURR-NEXT:    cmovneq %rdi, %r9
-; BURR-NEXT:    testb %dl, %dl
-; BURR-NEXT:    cmovsq %rdi, %r11
-; BURR-NEXT:    movq %r11, 8(%rax)
-; BURR-NEXT:    cmovsq %rdi, %r9
-; BURR-NEXT:    movq %r9, (%rax)
-; BURR-NEXT:    cmovnsq %rdi, %rbx
-; BURR-NEXT:    cmoveq %rdi, %rbx
-; BURR-NEXT:    movq %rbx, 24(%rax)
-; BURR-NEXT:    cmovnsq %r8, %r10
-; BURR-NEXT:    cmoveq %rdi, %r10
-; BURR-NEXT:    movq %r10, 16(%rax)
-; BURR-NEXT:    popq %rbx
+; BURR-NEXT:    andb $7, %cl
+; BURR-NEXT:    shrb $3, %sil
+; BURR-NEXT:    negb %sil
+; BURR-NEXT:    movsbq %sil, %rdx
+; BURR-NEXT:    movq -16(%rsp,%rdx), %rsi
+; BURR-NEXT:    movq -8(%rsp,%rdx), %rdi
+; BURR-NEXT:    shldq %cl, %rsi, %rdi
+; BURR-NEXT:    movq %rdi, 24(%rax)
+; BURR-NEXT:    movq -32(%rsp,%rdx), %rdi
+; BURR-NEXT:    movq -24(%rsp,%rdx), %rdx
+; BURR-NEXT:    movq %rdx, %r8
+; BURR-NEXT:    shldq %cl, %rdi, %r8
+; BURR-NEXT:    movq %r8, 8(%rax)
+; BURR-NEXT:    shlq %cl, %rdi
+; BURR-NEXT:    movq %rdi, (%rax)
+; BURR-NEXT:    shlq %cl, %rsi
+; BURR-NEXT:    notb %cl
+; BURR-NEXT:    shrq %rdx
+; BURR-NEXT:    shrq %cl, %rdx
+; BURR-NEXT:    orq %rsi, %rdx
+; BURR-NEXT:    movq %rdx, 16(%rax)
 ; BURR-NEXT:    retq
 ;
 ; SRC-LABEL: test1:
 ; SRC:       # %bb.0:
-; SRC-NEXT:    pushq %rbx
 ; SRC-NEXT:    movq %rdi, %rax
 ; SRC-NEXT:    addl %esi, %esi
-; SRC-NEXT:    leal 3(%rsi), %edx
-; SRC-NEXT:    movb $125, %cl
-; SRC-NEXT:    subb %sil, %cl
-; SRC-NEXT:    xorl %r8d, %r8d
-; SRC-NEXT:    movl $1, %edi
-; SRC-NEXT:    movl $1, %r10d
-; SRC-NEXT:    shrdq %cl, %r8, %r10
-; SRC-NEXT:    testb $64, %cl
-; SRC-NEXT:    cmovneq %r8, %r10
-; SRC-NEXT:    addb $-125, %sil
-; SRC-NEXT:    xorl %r9d, %r9d
-; SRC-NEXT:    movl %esi, %ecx
-; SRC-NEXT:    shldq %cl, %rdi, %r9
-; SRC-NEXT:    xorl %r11d, %r11d
+; SRC-NEXT:    addb $3, %sil
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movl %esi, %edx
+; SRC-NEXT:    andb $7, %dl
+; SRC-NEXT:    shrb $3, %sil
+; SRC-NEXT:    negb %sil
+; SRC-NEXT:    movsbq %sil, %rsi
+; SRC-NEXT:    movq -16(%rsp,%rsi), %rdi
+; SRC-NEXT:    movq %rdi, %r8
+; SRC-NEXT:    movl %edx, %ecx
+; SRC-NEXT:    shlq %cl, %r8
+; SRC-NEXT:    notb %cl
+; SRC-NEXT:    movq -32(%rsp,%rsi), %r9
+; SRC-NEXT:    movq -24(%rsp,%rsi), %r10
+; SRC-NEXT:    movq %r10, %r11
+; SRC-NEXT:    shrq %r11
+; SRC-NEXT:    shrq %cl, %r11
+; SRC-NEXT:    orq %r8, %r11
+; SRC-NEXT:    movq -8(%rsp,%rsi), %rsi
 ; SRC-NEXT:    movl %edx, %ecx
-; SRC-NEXT:    shldq %cl, %rdi, %r11
-; SRC-NEXT:    movl $1, %ebx
-; SRC-NEXT:    shlq %cl, %rbx
-; SRC-NEXT:    testb $64, %dl
-; SRC-NEXT:    cmovneq %rbx, %r11
-; SRC-NEXT:    cmovneq %r8, %rbx
-; SRC-NEXT:    movl %esi, %ecx
+; SRC-NEXT:    shldq %cl, %rdi, %rsi
+; SRC-NEXT:    movq %r9, %rdi
 ; SRC-NEXT:    shlq %cl, %rdi
-; SRC-NEXT:    testb $64, %sil
-; SRC-NEXT:    cmovneq %rdi, %r9
-; SRC-NEXT:    cmovneq %r8, %rdi
-; SRC-NEXT:    testb %dl, %dl
-; SRC-NEXT:    cmovnsq %r10, %rdi
-; SRC-NEXT:    cmoveq %r8, %rdi
-; SRC-NEXT:    cmovnsq %r8, %r9
-; SRC-NEXT:    cmoveq %r8, %r9
-; SRC-NEXT:    cmovsq %r8, %r11
-; SRC-NEXT:    cmovsq %r8, %rbx
-; SRC-NEXT:    movq %r11, 8(%rax)
-; SRC-NEXT:    movq %rbx, (%rax)
-; SRC-NEXT:    movq %r9, 24(%rax)
-; SRC-NEXT:    movq %rdi, 16(%rax)
-; SRC-NEXT:    popq %rbx
+; SRC-NEXT:    shldq %cl, %r9, %r10
+; SRC-NEXT:    movq %rsi, 24(%rax)
+; SRC-NEXT:    movq %r10, 8(%rax)
+; SRC-NEXT:    movq %rdi, (%rax)
+; SRC-NEXT:    movq %r11, 16(%rax)
 ; SRC-NEXT:    retq
 ;
 ; LIN-LABEL: test1:
 ; LIN:       # %bb.0:
 ; LIN-NEXT:    movq %rdi, %rax
-; LIN-NEXT:    xorl %edi, %edi
-; LIN-NEXT:    movl $1, %r8d
-; LIN-NEXT:    addl %esi, %esi
-; LIN-NEXT:    leal 3(%rsi), %ecx
-; LIN-NEXT:    movl $1, %edx
-; LIN-NEXT:    shlq %cl, %rdx
-; LIN-NEXT:    testb $64, %cl
-; LIN-NEXT:    movq %rdx, %r9
-; LIN-NEXT:    cmovneq %rdi, %r9
-; LIN-NEXT:    testb %cl, %cl
-; LIN-NEXT:    cmovsq %rdi, %r9
-; LIN-NEXT:    movq %r9, (%rax)
-; LIN-NEXT:    xorl %r9d, %r9d
-; LIN-NEXT:    # kill: def $cl killed $cl killed $ecx
-; LIN-NEXT:    shldq %cl, %r8, %r9
-; LIN-NEXT:    cmovneq %rdx, %r9
-; LIN-NEXT:    cmovsq %rdi, %r9
-; LIN-NEXT:    movq %r9, 8(%rax)
-; LIN-NEXT:    leal -125(%rsi), %edx
-; LIN-NEXT:    movl $1, %r9d
+; LIN-NEXT:    leal (%rsi,%rsi), %edx
+; LIN-NEXT:    addb $3, %dl
+; LIN-NEXT:    movl %edx, %ecx
+; LIN-NEXT:    shrb $3, %cl
+; LIN-NEXT:    negb %cl
+; LIN-NEXT:    movsbq %cl, %rsi
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq -32(%rsp,%rsi), %rdi
+; LIN-NEXT:    andb $7, %dl
+; LIN-NEXT:    movq %rdi, %r8
 ; LIN-NEXT:    movl %edx, %ecx
+; LIN-NEXT:    shlq %cl, %r8
+; LIN-NEXT:    movq %r8, (%rax)
+; LIN-NEXT:    movq -24(%rsp,%rsi), %r8
+; LIN-NEXT:    movq %r8, %r9
+; LIN-NEXT:    shldq %cl, %rdi, %r9
+; LIN-NEXT:    movq %r9, 8(%rax)
+; LIN-NEXT:    movq -16(%rsp,%rsi), %rdi
+; LIN-NEXT:    movq %rdi, %r9
 ; LIN-NEXT:    shlq %cl, %r9
-; LIN-NEXT:    testb $64, %dl
-; LIN-NEXT:    movq %r9, %r10
-; LIN-NEXT:    cmovneq %rdi, %r10
-; LIN-NEXT:    movb $125, %cl
-; LIN-NEXT:    subb %sil, %cl
-; LIN-NEXT:    movl $1, %esi
-; LIN-NEXT:    shrdq %cl, %rdi, %rsi
-; LIN-NEXT:    testb $64, %cl
-; LIN-NEXT:    cmovneq %rdi, %rsi
-; LIN-NEXT:    cmovsq %r10, %rsi
-; LIN-NEXT:    cmoveq %rdi, %rsi
-; LIN-NEXT:    movq %rsi, 16(%rax)
-; LIN-NEXT:    xorl %esi, %esi
+; LIN-NEXT:    shrq %r8
+; LIN-NEXT:    notb %cl
+; LIN-NEXT:    shrq %cl, %r8
+; LIN-NEXT:    orq %r9, %r8
+; LIN-NEXT:    movq %r8, 16(%rax)
+; LIN-NEXT:    movq -8(%rsp,%rsi), %rsi
 ; LIN-NEXT:    movl %edx, %ecx
-; LIN-NEXT:    shldq %cl, %r8, %rsi
-; LIN-NEXT:    cmovneq %r9, %rsi
-; LIN-NEXT:    cmovnsq %rdi, %rsi
-; LIN-NEXT:    cmoveq %rdi, %rsi
+; LIN-NEXT:    shldq %cl, %rdi, %rsi
 ; LIN-NEXT:    movq %rsi, 24(%rax)
 ; LIN-NEXT:    retq
   %b = add i256 %a, 1

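(Editorial aside, not part of the patch: the checks above and below all follow the same byte-granular pattern — spill the value next to padding on the stack, index by the shift amount divided by 8, reload, then shift by the remaining 0..7 bits. The C sketch here is a minimal little-endian model of that lowering for a 256-bit left shift, matching what the test1 checks above do; the names u256 and shl_u256_via_stack and the exact buffer layout are illustrative, not the backend's actual frame layout.)

#include <stdint.h>
#include <string.h>

typedef struct { uint64_t w[4]; } u256;   /* four 64-bit limbs, little-endian */

static u256 shl_u256_via_stack(u256 v, unsigned amt /* 0..255 */) {
  /* Slot twice the width of the type: low half padding, high half the value.
   * (For an arithmetic right shift the padding would be sign bytes instead.) */
  uint8_t slot[64];
  memset(slot, 0, 32);           /* padding */
  memcpy(slot + 32, &v, 32);     /* the value being shifted */

  /* Byte-granular part: read 32 bytes starting amt/8 bytes below the value,
   * pulling padding in from the low half of the slot. */
  u256 r;
  memcpy(&r, slot + 32 - (amt / 8), 32);

  /* Remaining 0..7 bits are shifted across the limbs (the shld/shl pairs above). */
  unsigned rem = amt % 8;
  if (rem) {
    for (int i = 3; i > 0; --i)
      r.w[i] = (r.w[i] << rem) | (r.w[i - 1] >> (64 - rem));
    r.w[0] <<= rem;
  }
  return r;
}

For the right-shift cases in the shift-i128.ll checks that follow, the roles are mirrored: the value is stored in the low half of the slot, the padding (zeros for lshr, sign words for ashr) in the high half, and the reload uses a positive byte offset of amt/8.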
diff  --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 10064772a8567..aefc4df882c7d 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -13,112 +13,46 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $20, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %edi, %esi
-; i686-NEXT:    shrl %cl, %edx
-; i686-NEXT:    shrl %cl, %edi
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    jne .LBB0_1
-; i686-NEXT:  # %bb.2: # %entry
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    jmp .LBB0_3
-; i686-NEXT:  .LBB0_1:
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    xorl %edi, %edi
-; i686-NEXT:  .LBB0_3: # %entry
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %edx
-; i686-NEXT:    subb $64, %dl
-; i686-NEXT:    jb .LBB0_5
-; i686-NEXT:  # %bb.4: # %entry
-; i686-NEXT:    xorl %edi, %edi
-; i686-NEXT:  .LBB0_5: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    negb %dl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shldl %cl, %ebp, %edi
-; i686-NEXT:    movl %ebp, %esi
-; i686-NEXT:    shll %cl, %esi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl %esi, %ebx
-; i686-NEXT:    jne .LBB0_7
-; i686-NEXT:  # %bb.6: # %entry
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:  .LBB0_7: # %entry
-; i686-NEXT:    movb %al, %ah
-; i686-NEXT:    addb $-64, %ah
+; i686-NEXT:    subl $32, %esp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movb %ah, %cl
-; i686-NEXT:    shrl %cl, %edi
-; i686-NEXT:    testb $32, %ah
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB0_9
-; i686-NEXT:  # %bb.8: # %entry
-; i686-NEXT:    movl %edi, %ecx
-; i686-NEXT:  .LBB0_9: # %entry
-; i686-NEXT:    cmpb $64, %al
-; i686-NEXT:    jb .LBB0_10
-; i686-NEXT:  # %bb.11: # %entry
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    jmp .LBB0_12
-; i686-NEXT:  .LBB0_10:
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    orl %ebx, %ecx
-; i686-NEXT:  .LBB0_12: # %entry
-; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    jne .LBB0_14
-; i686-NEXT:  # %bb.13: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB0_14: # %entry
-; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, (%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    andb $7, %al
+; i686-NEXT:    shrb $3, %cl
+; i686-NEXT:    andb $15, %cl
+; i686-NEXT:    movzbl %cl, %ebp
+; i686-NEXT:    movl 4(%esp,%ebp), %edx
+; i686-NEXT:    movl %edx, %esi
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shrdl %cl, %esi, %edx
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    jne .LBB0_16
-; i686-NEXT:  # %bb.15: # %entry
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB0_16: # %entry
-; i686-NEXT:    movb %ah, %cl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    shrdl %cl, %edx, %ebp
-; i686-NEXT:    testb $32, %ah
-; i686-NEXT:    jne .LBB0_18
-; i686-NEXT:  # %bb.17: # %entry
-; i686-NEXT:    movl %ebp, %edi
-; i686-NEXT:  .LBB0_18: # %entry
-; i686-NEXT:    cmpb $64, %al
-; i686-NEXT:    jae .LBB0_20
-; i686-NEXT:  # %bb.19:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; i686-NEXT:  .LBB0_20: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    testb %al, %al
-; i686-NEXT:    je .LBB0_22
-; i686-NEXT:  # %bb.21: # %entry
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:    movl (%esp), %esi # 4-byte Reload
-; i686-NEXT:  .LBB0_22: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 8(%ecx)
-; i686-NEXT:    movl %esi, 4(%ecx)
-; i686-NEXT:    movl %ebx, (%ecx)
-; i686-NEXT:    addl $20, %esp
+; i686-NEXT:    shrl %cl, %esi
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 8(%esp,%ebp), %ebx
+; i686-NEXT:    leal (%ebx,%ebx), %edi
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    orl %esi, %edi
+; i686-NEXT:    movl (%esp,%ebp), %esi
+; i686-NEXT:    movl 12(%esp,%ebp), %ebp
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shrdl %cl, %ebp, %ebx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    shrl %cl, %ebp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %ebp, 12(%eax)
+; i686-NEXT:    movl %ebx, 8(%eax)
+; i686-NEXT:    movl %esi, (%eax)
+; i686-NEXT:    movl %edi, 4(%eax)
+; i686-NEXT:    addl $32, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -150,116 +84,47 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $24, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebx, %esi
-; i686-NEXT:    shrl %cl, %edx
-; i686-NEXT:    movl %ebx, %edi
-; i686-NEXT:    sarl %cl, %edi
-; i686-NEXT:    sarl $31, %ebx
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jne .LBB1_1
-; i686-NEXT:  # %bb.2: # %entry
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    jmp .LBB1_3
-; i686-NEXT:  .LBB1_1:
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    movl %ebx, %edi
-; i686-NEXT:  .LBB1_3: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %edx
-; i686-NEXT:    subb $64, %dl
-; i686-NEXT:    jb .LBB1_5
-; i686-NEXT:  # %bb.4: # %entry
-; i686-NEXT:    movl %ebx, %edi
-; i686-NEXT:  .LBB1_5: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    negb %dl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shldl %cl, %ebp, %edi
-; i686-NEXT:    movl %ebp, %esi
-; i686-NEXT:    shll %cl, %esi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl %esi, %ecx
-; i686-NEXT:    jne .LBB1_7
-; i686-NEXT:  # %bb.6: # %entry
-; i686-NEXT:    movl %edi, %ecx
-; i686-NEXT:  .LBB1_7: # %entry
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movb %al, %ah
-; i686-NEXT:    addb $-64, %ah
+; i686-NEXT:    subl $32, %esp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movb %ah, %cl
-; i686-NEXT:    sarl %cl, %edi
-; i686-NEXT:    testb $32, %ah
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    jne .LBB1_9
-; i686-NEXT:  # %bb.8: # %entry
-; i686-NEXT:    movl %edi, %ecx
-; i686-NEXT:  .LBB1_9: # %entry
-; i686-NEXT:    cmpb $64, %al
-; i686-NEXT:    jb .LBB1_10
-; i686-NEXT:  # %bb.11: # %entry
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jmp .LBB1_12
-; i686-NEXT:  .LBB1_10:
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; i686-NEXT:  .LBB1_12: # %entry
-; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    jne .LBB1_14
-; i686-NEXT:  # %bb.13: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB1_14: # %entry
-; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, (%esp)
+; i686-NEXT:    sarl $31, %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    andb $7, %al
+; i686-NEXT:    shrb $3, %cl
+; i686-NEXT:    andb $15, %cl
+; i686-NEXT:    movzbl %cl, %ebp
+; i686-NEXT:    movl 4(%esp,%ebp), %edx
+; i686-NEXT:    movl %edx, %esi
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shrdl %cl, %esi, %edx
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    jne .LBB1_16
-; i686-NEXT:  # %bb.15: # %entry
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB1_16: # %entry
-; i686-NEXT:    movb %ah, %cl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    shrdl %cl, %edx, %ebp
-; i686-NEXT:    testb $32, %ah
-; i686-NEXT:    jne .LBB1_18
-; i686-NEXT:  # %bb.17: # %entry
-; i686-NEXT:    movl %ebp, %edi
-; i686-NEXT:  .LBB1_18: # %entry
-; i686-NEXT:    cmpb $64, %al
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    jae .LBB1_20
-; i686-NEXT:  # %bb.19:
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; i686-NEXT:    movl %ecx, %edi
-; i686-NEXT:  .LBB1_20: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    testb %al, %al
-; i686-NEXT:    je .LBB1_22
-; i686-NEXT:  # %bb.21: # %entry
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:    movl (%esp), %esi # 4-byte Reload
-; i686-NEXT:  .LBB1_22: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 8(%ecx)
-; i686-NEXT:    movl %esi, 4(%ecx)
-; i686-NEXT:    movl %ebx, (%ecx)
-; i686-NEXT:    addl $24, %esp
+; i686-NEXT:    shrl %cl, %esi
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 8(%esp,%ebp), %ebx
+; i686-NEXT:    leal (%ebx,%ebx), %edi
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    orl %esi, %edi
+; i686-NEXT:    movl (%esp,%ebp), %esi
+; i686-NEXT:    movl 12(%esp,%ebp), %ebp
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shrdl %cl, %ebp, %ebx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    sarl %cl, %ebp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %ebp, 12(%eax)
+; i686-NEXT:    movl %ebx, 8(%eax)
+; i686-NEXT:    movl %esi, (%eax)
+; i686-NEXT:    movl %edi, 4(%eax)
+; i686-NEXT:    addl $32, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -292,113 +157,51 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $20, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    subl $32, %esp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, (%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    andb $7, %al
+; i686-NEXT:    shrb $3, %cl
+; i686-NEXT:    andb $15, %cl
+; i686-NEXT:    negb %cl
+; i686-NEXT:    movsbl %cl, %ebp
+; i686-NEXT:    movl 24(%esp,%ebp), %edx
+; i686-NEXT:    movl %edx, %ebx
 ; i686-NEXT:    movl %eax, %ecx
 ; i686-NEXT:    shll %cl, %ebx
-; i686-NEXT:    movl %ebp, %esi
-; i686-NEXT:    shll %cl, %esi
-; i686-NEXT:    movl %edi, %edx
-; i686-NEXT:    shldl %cl, %ebp, %edx
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    jne .LBB2_1
-; i686-NEXT:  # %bb.2: # %entry
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; i686-NEXT:    jmp .LBB2_3
-; i686-NEXT:  .LBB2_1:
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    xorl %esi, %esi
-; i686-NEXT:  .LBB2_3: # %entry
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %edx
-; i686-NEXT:    subb $64, %dl
-; i686-NEXT:    jb .LBB2_5
-; i686-NEXT:  # %bb.4: # %entry
-; i686-NEXT:    xorl %esi, %esi
-; i686-NEXT:  .LBB2_5: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    negb %dl
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 20(%esp,%ebp), %edi
 ; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrl %esi
 ; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    shrdl %cl, %edi, %ebx
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl %esi, %ebp
-; i686-NEXT:    jne .LBB2_7
-; i686-NEXT:  # %bb.6: # %entry
-; i686-NEXT:    movl %ebx, %ebp
-; i686-NEXT:  .LBB2_7: # %entry
-; i686-NEXT:    movb %al, %ah
-; i686-NEXT:    addb $-64, %ah
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movb %ah, %cl
-; i686-NEXT:    shll %cl, %ebx
-; i686-NEXT:    testb $32, %ah
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB2_9
-; i686-NEXT:  # %bb.8: # %entry
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:  .LBB2_9: # %entry
-; i686-NEXT:    cmpb $64, %al
-; i686-NEXT:    jb .LBB2_10
-; i686-NEXT:  # %bb.11: # %entry
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    jmp .LBB2_12
-; i686-NEXT:  .LBB2_10:
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    orl %ebp, %ecx
-; i686-NEXT:  .LBB2_12: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    jne .LBB2_14
-; i686-NEXT:  # %bb.13: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB2_14: # %entry
-; i686-NEXT:    movl %edx, %esi
+; i686-NEXT:    orl %ebx, %esi
+; i686-NEXT:    movl 16(%esp,%ebp), %ebx
+; i686-NEXT:    movl 28(%esp,%ebp), %ebp
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shldl %cl, %ebp, %esi
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    jne .LBB2_16
-; i686-NEXT:  # %bb.15: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB2_16: # %entry
-; i686-NEXT:    movb %ah, %cl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shldl %cl, %esi, %edi
-; i686-NEXT:    testb $32, %ah
-; i686-NEXT:    jne .LBB2_18
-; i686-NEXT:  # %bb.17: # %entry
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:  .LBB2_18: # %entry
-; i686-NEXT:    cmpb $64, %al
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    jae .LBB2_20
-; i686-NEXT:  # %bb.19:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; i686-NEXT:  .LBB2_20: # %entry
+; i686-NEXT:    shldl %cl, %edx, %ebp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    testb %al, %al
-; i686-NEXT:    je .LBB2_22
-; i686-NEXT:  # %bb.21: # %entry
+; i686-NEXT:    movl %ebp, 12(%ecx)
 ; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; i686-NEXT:  .LBB2_22: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 4(%ecx)
-; i686-NEXT:    movl %esi, (%ecx)
-; i686-NEXT:    movl %edx, 12(%ecx)
-; i686-NEXT:    movl %ebp, 8(%ecx)
-; i686-NEXT:    addl $20, %esp
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    shldl %cl, %ebx, %edi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %edi, 4(%eax)
+; i686-NEXT:    movl %edx, (%eax)
+; i686-NEXT:    movl %esi, 8(%eax)
+; i686-NEXT:    addl $32, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -464,258 +267,107 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $68, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    subl $100, %esp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %ebx, %edi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrl %cl, %edi
-; i686-NEXT:    movl %esi, %ebp
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    shrdl %cl, %esi, %edx
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    jne .LBB6_1
-; i686-NEXT:  # %bb.2: # %entry
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, %ecx
+; i686-NEXT:    andl $7, %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $15, %esi
+; i686-NEXT:    movl 40(%esp,%esi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    notl %ecx
+; i686-NEXT:    movl 44(%esp,%esi), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    addl %edx, %edx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 36(%esp,%esi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    andl $7, %edx
+; i686-NEXT:    shrl $3, %ebx
+; i686-NEXT:    andl $15, %ebx
+; i686-NEXT:    movl 72(%esp,%ebx), %ebp
 ; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jmp .LBB6_3
-; i686-NEXT:  .LBB6_1:
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:  .LBB6_3: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebx, %esi
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    jne .LBB6_5
-; i686-NEXT:  # %bb.4: # %entry
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:  .LBB6_5: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %ebx
 ; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    subl $64, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB6_7
-; i686-NEXT:  # %bb.6: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:  .LBB6_7: # %entry
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    shrdl %cl, %ebp, %esi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    jne .LBB6_9
-; i686-NEXT:  # %bb.8: # %entry
-; i686-NEXT:    movl %esi, %ebx
-; i686-NEXT:  .LBB6_9: # %entry
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB6_11
-; i686-NEXT:  # %bb.10: # %entry
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:  .LBB6_11: # %entry
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movb $64, %cl
-; i686-NEXT:    subb %dl, %cl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    shldl %cl, %ebx, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %edi
+; i686-NEXT:    notl %ecx
+; i686-NEXT:    movl 76(%esp,%ebx), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    leal (%eax,%eax), %edi
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movb $64, %bl
-; i686-NEXT:    jne .LBB6_12
-; i686-NEXT:  # %bb.13: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jmp .LBB6_14
-; i686-NEXT:  .LBB6_12:
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:  .LBB6_14: # %entry
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    shrdl %cl, %ebp, %esi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    jne .LBB6_16
-; i686-NEXT:  # %bb.15: # %entry
+; i686-NEXT:    orl %ebp, %edi
+; i686-NEXT:    movl 48(%esp,%esi), %esi
 ; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB6_16: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    subb %al, %bl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shll %cl, %ebp
-; i686-NEXT:    testb $32, %bl
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB6_18
-; i686-NEXT:  # %bb.17: # %entry
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:  .LBB6_18: # %entry
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    subl $64, %ecx
-; i686-NEXT:    sbbl $0, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    sbbl $0, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    sbbl $0, %esi
-; i686-NEXT:    setae %bh
-; i686-NEXT:    jb .LBB6_20
-; i686-NEXT:  # %bb.19: # %entry
-; i686-NEXT:    xorl %edi, %edi
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:  .LBB6_20: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jne .LBB6_22
-; i686-NEXT:  # %bb.21: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB6_22: # %entry
-; i686-NEXT:    testb %bh, %bh
-; i686-NEXT:    jne .LBB6_24
-; i686-NEXT:  # %bb.23:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB6_24: # %entry
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB6_26
-; i686-NEXT:  # %bb.25: # %entry
-; i686-NEXT:    movl %esi, %ecx
-; i686-NEXT:  .LBB6_26: # %entry
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    shldl %cl, %edi, %esi
-; i686-NEXT:    testb $32, %bl
-; i686-NEXT:    jne .LBB6_28
-; i686-NEXT:  # %bb.27: # %entry
-; i686-NEXT:    movl %esi, %ebp
-; i686-NEXT:  .LBB6_28: # %entry
-; i686-NEXT:    testb %bh, %bh
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    jne .LBB6_30
-; i686-NEXT:  # %bb.29:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    orl %ebp, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB6_30: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    jne .LBB6_32
-; i686-NEXT:  # %bb.31: # %entry
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:  .LBB6_32: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    shrdl %cl, %ebp, %edi
-; i686-NEXT:    movl %edi, %ebp
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    je .LBB6_33
-; i686-NEXT:  # %bb.34: # %entry
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    jne .LBB6_35
-; i686-NEXT:  .LBB6_36: # %entry
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    je .LBB6_38
-; i686-NEXT:  .LBB6_37:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB6_38: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl %ecx, %edx
-; i686-NEXT:    je .LBB6_40
-; i686-NEXT:  # %bb.39: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl 68(%esp,%ebx), %ecx
+; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; i686-NEXT:    movl 80(%esp,%ebx), %esi
+; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:  .LBB6_40: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    orl %edx, %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    je .LBB6_42
-; i686-NEXT:  # %bb.41: # %entry
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; i686-NEXT:    shrl %cl, %ebp
+; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:  .LBB6_42: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 28(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 24(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 12(%ecx)
+; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrl %cl, %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl %esi, 28(%ecx)
+; i686-NEXT:    movl %ebx, 24(%ecx)
+; i686-NEXT:    movl (%esp), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 16(%ecx)
+; i686-NEXT:    movl %ebp, 12(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl %esi, 20(%ecx)
-; i686-NEXT:    movl %eax, 16(%ecx)
-; i686-NEXT:    movl %ebx, 4(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; i686-NEXT:    movl %edx, (%ecx)
+; i686-NEXT:    movl %edi, 20(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, (%ecx)
-; i686-NEXT:    addl $68, %esp
+; i686-NEXT:    movl %eax, 4(%ecx)
+; i686-NEXT:    addl $100, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
 ; i686-NEXT:    popl %ebp
 ; i686-NEXT:    retl
-; i686-NEXT:  .LBB6_33: # %entry
-; i686-NEXT:    movl %ebp, %edi
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    je .LBB6_36
-; i686-NEXT:  .LBB6_35:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; i686-NEXT:    movl %ecx, %edi
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    jne .LBB6_37
-; i686-NEXT:    jmp .LBB6_38
 ;
 ; x86_64-LABEL: test_lshr_v2i128:
 ; x86_64:       # %bb.0: # %entry
@@ -754,261 +406,111 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $80, %esp
+; i686-NEXT:    subl $92, %esp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %eax
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ebp, %ebx
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    sarl %cl, %ebx
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:    shrl %cl, %edi
-; i686-NEXT:    shrdl %cl, %esi, %edx
-; i686-NEXT:    sarl $31, %ebp
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jne .LBB7_1
-; i686-NEXT:  # %bb.2: # %entry
+; i686-NEXT:    andl $7, %ebx
+; i686-NEXT:    shrl $3, %ebp
+; i686-NEXT:    andl $15, %ebp
+; i686-NEXT:    movl 32(%esp,%ebp), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    notl %ecx
+; i686-NEXT:    movl 36(%esp,%ebp), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jmp .LBB7_3
-; i686-NEXT:  .LBB7_1:
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_3: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    shrdl %cl, %edx, %edi
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    jne .LBB7_5
-; i686-NEXT:  # %bb.4: # %entry
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:  .LBB7_5: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, %edi
+; i686-NEXT:    addl %edx, %edx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    movl %edi, %edx
+; i686-NEXT:    andl $7, %edx
+; i686-NEXT:    shrl $3, %ecx
+; i686-NEXT:    andl $15, %ecx
+; i686-NEXT:    movl 64(%esp,%ecx), %esi
+; i686-NEXT:    movl %ecx, %edi
+; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    sarl %cl, %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    sarl $31, %ebp
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    subl $64, %ecx
+; i686-NEXT:    notl %ecx
+; i686-NEXT:    movl 68(%esp,%edi), %eax
+; i686-NEXT:    leal (%eax,%eax), %edi
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    orl %esi, %edi
+; i686-NEXT:    movl 28(%esp,%ebp), %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl $0, %esi
+; i686-NEXT:    movl 40(%esp,%ebp), %esi
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    movl 60(%esp,%ecx), %ebp
 ; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:    jne .LBB7_7
-; i686-NEXT:  # %bb.6: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl %edi, %ecx
-; i686-NEXT:  .LBB7_7: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    movl 72(%esp,%ecx), %ebp
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    shrdl %cl, %ebp, %esi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    jne .LBB7_9
-; i686-NEXT:  # %bb.8: # %entry
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:  .LBB7_9: # %entry
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %ebp, %eax
+; i686-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    sarl %cl, %esi
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    jne .LBB7_11
-; i686-NEXT:  # %bb.10: # %entry
-; i686-NEXT:    movl %esi, %ecx
-; i686-NEXT:  .LBB7_11: # %entry
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movb $64, %cl
-; i686-NEXT:    subb %dl, %cl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    shldl %cl, %ebx, %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ebp
-; i686-NEXT:    shll %cl, %ebp
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movb $64, %bl
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    je .LBB7_13
-; i686-NEXT:  # %bb.12:
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    xorl %ebp, %ebp
-; i686-NEXT:  .LBB7_13: # %entry
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    shrdl %cl, %edi, %esi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    jne .LBB7_15
-; i686-NEXT:  # %bb.14: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_15: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    subb %al, %bl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shll %cl, %ebp
-; i686-NEXT:    testb $32, %bl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    jne .LBB7_17
-; i686-NEXT:  # %bb.16: # %entry
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_17: # %entry
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    subl $64, %ecx
-; i686-NEXT:    sbbl $0, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    sbbl $0, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    sbbl $0, %esi
-; i686-NEXT:    setae %bh
-; i686-NEXT:    jb .LBB7_19
-; i686-NEXT:  # %bb.18: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_19: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shrdl %cl, %edi, %esi
-; i686-NEXT:    sarl %cl, %edi
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    je .LBB7_20
-; i686-NEXT:  # %bb.21: # %entry
-; i686-NEXT:    testb %bh, %bh
-; i686-NEXT:    je .LBB7_22
-; i686-NEXT:  .LBB7_23: # %entry
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    jne .LBB7_25
-; i686-NEXT:  .LBB7_24: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_25: # %entry
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shldl %cl, %esi, %edi
-; i686-NEXT:    testb $32, %bl
-; i686-NEXT:    jne .LBB7_27
-; i686-NEXT:  # %bb.26: # %entry
-; i686-NEXT:    movl %edi, %ebp
-; i686-NEXT:  .LBB7_27: # %entry
-; i686-NEXT:    testb %bh, %bh
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    jne .LBB7_29
-; i686-NEXT:  # %bb.28:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    orl %ebp, %ebx
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_29: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    jne .LBB7_31
-; i686-NEXT:  # %bb.30: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_31: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    shrdl %cl, %ebp, %ebx
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    jne .LBB7_33
-; i686-NEXT:  # %bb.32: # %entry
-; i686-NEXT:    movl %ebx, %esi
-; i686-NEXT:  .LBB7_33: # %entry
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    je .LBB7_35
-; i686-NEXT:  # %bb.34:
+; i686-NEXT:    shrdl %cl, %eax, %ebx
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    sarl %cl, %ebp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %ebp, 28(%eax)
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 24(%eax)
+; i686-NEXT:    movl %ebx, 16(%eax)
+; i686-NEXT:    movl %esi, 12(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    orl %ebx, %ecx
-; i686-NEXT:    movl %ecx, %esi
-; i686-NEXT:  .LBB7_35: # %entry
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    je .LBB7_37
-; i686-NEXT:  # %bb.36:
+; i686-NEXT:    movl %ecx, 8(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_37: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl %ecx, %edx
-; i686-NEXT:    je .LBB7_39
-; i686-NEXT:  # %bb.38: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    movl %edi, 20(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB7_39: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    orl %edx, %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    je .LBB7_41
-; i686-NEXT:  # %bb.40: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:  .LBB7_41: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 28(%ecx)
-; i686-NEXT:    movl %edi, 24(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl %esi, 20(%ecx)
-; i686-NEXT:    movl %eax, 16(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 4(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, (%ecx)
-; i686-NEXT:    addl $80, %esp
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    addl $92, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
 ; i686-NEXT:    popl %ebp
 ; i686-NEXT:    retl
-; i686-NEXT:  .LBB7_20: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    testb %bh, %bh
-; i686-NEXT:    jne .LBB7_23
-; i686-NEXT:  .LBB7_22:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    je .LBB7_24
-; i686-NEXT:    jmp .LBB7_25
 ;
 ; x86_64-LABEL: test_ashr_v2i128:
 ; x86_64:       # %bb.0: # %entry
@@ -1050,271 +552,111 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $72, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    subl $100, %esp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shll %cl, %ebp
-; i686-NEXT:    shll %cl, %esi
-; i686-NEXT:    movl %edx, %eax
-; i686-NEXT:    subl $64, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    sbbl $0, %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    sbbl $0, %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    sbbl $0, %eax
-; i686-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    testb $32, %bl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl $0, %eax
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB8_2
-; i686-NEXT:  # %bb.1: # %entry
-; i686-NEXT:    movl %esi, %eax
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:  .LBB8_2: # %entry
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edi, %eax
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    shldl %cl, %edi, %eax
-; i686-NEXT:    testb $32, %bl
-; i686-NEXT:    jne .LBB8_4
-; i686-NEXT:  # %bb.3: # %entry
-; i686-NEXT:    movl %eax, %esi
-; i686-NEXT:  .LBB8_4: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movb $64, %cl
-; i686-NEXT:    subb %bl, %cl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    shrl %cl, %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    shrdl %cl, %edi, %eax
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    jne .LBB8_5
-; i686-NEXT:  # %bb.6: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    jmp .LBB8_7
-; i686-NEXT:  .LBB8_5:
-; i686-NEXT:    movl %esi, %eax
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:  .LBB8_7: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shldl %cl, %esi, %edi
-; i686-NEXT:    testb $32, %bl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    jne .LBB8_9
-; i686-NEXT:  # %bb.8: # %entry
-; i686-NEXT:    movl %edi, %ebp
-; i686-NEXT:  .LBB8_9: # %entry
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ecx, %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shll %cl, %ebp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    shll %cl, %esi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl $0, %edi
-; i686-NEXT:    movl $0, %ecx
-; i686-NEXT:    jne .LBB8_11
-; i686-NEXT:  # %bb.10: # %entry
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:  .LBB8_11: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    shldl %cl, %ebx, %edi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    jne .LBB8_13
-; i686-NEXT:  # %bb.12: # %entry
-; i686-NEXT:    movl %edi, %ebp
-; i686-NEXT:  .LBB8_13: # %entry
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movb $64, %cl
-; i686-NEXT:    subb %dl, %cl
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    shrl %cl, %ebx
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    movl $0, %ecx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    jne .LBB8_15
-; i686-NEXT:  # %bb.14: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:  .LBB8_15: # %entry
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, %ecx
+; i686-NEXT:    shrl $3, %ebp
+; i686-NEXT:    andl $15, %ebp
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    subl %ebp, %eax
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%eax), %edx
+; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT:    andl $7, %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    movl 4(%eax), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl %esi
+; i686-NEXT:    notl %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrl %cl, %esi
+; i686-NEXT:    orl %edx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl (%eax), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $15, %edx
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    subl %edx, %esi
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $7, %ebx
+; i686-NEXT:    movl 8(%esi), %edi
 ; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    shldl %cl, %ebp, %edi
-; i686-NEXT:    testb $32, %dl
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    jne .LBB8_17
-; i686-NEXT:  # %bb.16: # %entry
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:  .LBB8_17: # %entry
-; i686-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl %ebx, %eax
-; i686-NEXT:    subl $64, %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    sbbl $0, %ecx
-; i686-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; i686-NEXT:    jb .LBB8_19
-; i686-NEXT:  # %bb.18: # %entry
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:  .LBB8_19: # %entry
-; i686-NEXT:    jb .LBB8_21
-; i686-NEXT:  # %bb.20: # %entry
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:  .LBB8_21: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, %ebx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    shll %cl, %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    shldl %cl, %ebp, %edi
-; i686-NEXT:    testb $32, %cl
 ; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    jne .LBB8_23
-; i686-NEXT:  # %bb.22: # %entry
-; i686-NEXT:    movl %edi, %ecx
-; i686-NEXT:  .LBB8_23: # %entry
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    movl 4(%esi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl %eax
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    notl %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    orl %edi, %eax
+; i686-NEXT:    movl (%esi), %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    shll %cl, %edi
 ; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    movl $0, %edi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    jne .LBB8_25
-; i686-NEXT:  # %bb.24: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:  .LBB8_25: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    jne .LBB8_27
-; i686-NEXT:  # %bb.26: # %entry
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB8_27: # %entry
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    movl %ecx, %edi
+; i686-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    negl %ebp
+; i686-NEXT:    movl 64(%esp,%ebp), %esi
+; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; i686-NEXT:    shldl %cl, %edi, %esi
-; i686-NEXT:    testb $32, %al
-; i686-NEXT:    jne .LBB8_29
-; i686-NEXT:  # %bb.28: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB8_29: # %entry
-; i686-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    jne .LBB8_30
-; i686-NEXT:  # %bb.31: # %entry
-; i686-NEXT:    testb %al, %al
-; i686-NEXT:    je .LBB8_32
-; i686-NEXT:  .LBB8_33: # %entry
-; i686-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    jne .LBB8_35
-; i686-NEXT:  .LBB8_34: # %entry
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB8_35: # %entry
-; i686-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    shrdl %cl, %ebx, %esi
-; i686-NEXT:    testb $32, %cl
-; i686-NEXT:    jne .LBB8_37
-; i686-NEXT:  # %bb.36: # %entry
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB8_37: # %entry
-; i686-NEXT:    testb %al, %al
+; i686-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    jne .LBB8_38
-; i686-NEXT:  # %bb.39: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    testb %al, %al
-; i686-NEXT:    jne .LBB8_41
-; i686-NEXT:    jmp .LBB8_42
-; i686-NEXT:  .LBB8_30:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    orl %ebp, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    testb %al, %al
-; i686-NEXT:    jne .LBB8_33
-; i686-NEXT:  .LBB8_32: # %entry
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; i686-NEXT:    je .LBB8_34
-; i686-NEXT:    jmp .LBB8_35
-; i686-NEXT:  .LBB8_38:
+; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    shll %cl, %edi
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; i686-NEXT:    shldl %cl, %esi, %ebp
+; i686-NEXT:    negl %edx
+; i686-NEXT:    movl 96(%esp,%edx), %edx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    shldl %cl, %ebx, %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    testb %al, %al
-; i686-NEXT:    je .LBB8_42
-; i686-NEXT:  .LBB8_41:
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB8_42: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl %eax, %edx
-; i686-NEXT:    je .LBB8_44
-; i686-NEXT:  # %bb.43: # %entry
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:  .LBB8_44: # %entry
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    orl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    orl %edx, %ebx
-; i686-NEXT:    je .LBB8_46
-; i686-NEXT:  # %bb.45: # %entry
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:  .LBB8_46: # %entry
-; i686-NEXT:    movl %esi, 20(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 16(%eax)
+; i686-NEXT:    movl %edx, 28(%ecx)
+; i686-NEXT:    movl %ebp, 20(%ecx)
+; i686-NEXT:    movl %edi, 16(%ecx)
+; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
+; i686-NEXT:    movl %edx, 12(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 4(%eax)
+; i686-NEXT:    movl %edx, 4(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%eax)
-; i686-NEXT:    movl %edi, 28(%eax)
-; i686-NEXT:    movl %ecx, 24(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 12(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 8(%eax)
-; i686-NEXT:    addl $72, %esp
+; i686-NEXT:    movl %edx, (%ecx)
+; i686-NEXT:    movl %eax, 24(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 8(%ecx)
+; i686-NEXT:    addl $100, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index aa66a9a4a2eb6..2f4071530382b 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -18,221 +18,86 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $32, %esp
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movb %al, %ah
-; CHECK-NEXT:    addb $64, %ah
-; CHECK-NEXT:    movl $1, %edi
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    movb %ah, %cl
-; CHECK-NEXT:    shldl %cl, %edi, %edx
-; CHECK-NEXT:    movl $1, %ebx
-; CHECK-NEXT:    shll %cl, %ebx
-; CHECK-NEXT:    testb $32, %ah
-; CHECK-NEXT:    movl %ebx, %ebp
-; CHECK-NEXT:    jne .LBB1_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    movl %edx, %ebp
-; CHECK-NEXT:  .LBB1_2:
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    addb $-128, %dl
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shldl %cl, %edi, %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl $1, %esi
-; CHECK-NEXT:    shll %cl, %esi
-; CHECK-NEXT:    testb $32, %dl
-; CHECK-NEXT:    je .LBB1_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:  .LBB1_4:
-; CHECK-NEXT:    cmpb $64, %dl
-; CHECK-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT:    jb .LBB1_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:  .LBB1_6:
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    testb $32, %ah
-; CHECK-NEXT:    movl $0, %ebp
-; CHECK-NEXT:    jne .LBB1_8
-; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    movl %ebx, %ebp
-; CHECK-NEXT:  .LBB1_8:
-; CHECK-NEXT:    movb $-64, %cl
-; CHECK-NEXT:    subb %al, %cl
-; CHECK-NEXT:    movl $1, %esi
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    shrdl %cl, %ebx, %esi
-; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    movl $0, %ebx
-; CHECK-NEXT:    jne .LBB1_10
-; CHECK-NEXT:  # %bb.9:
-; CHECK-NEXT:    movl %esi, %ebx
-; CHECK-NEXT:  .LBB1_10:
-; CHECK-NEXT:    cmpb $64, %dl
-; CHECK-NEXT:    jb .LBB1_12
-; CHECK-NEXT:  # %bb.11:
-; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl %ebp, %ebx
-; CHECK-NEXT:  .LBB1_12:
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shldl %cl, %edi, %esi
-; CHECK-NEXT:    movl $1, %ebp
-; CHECK-NEXT:    shll %cl, %ebp
-; CHECK-NEXT:    testb $32, %al
-; CHECK-NEXT:    je .LBB1_14
-; CHECK-NEXT:  # %bb.13:
-; CHECK-NEXT:    movl %ebp, %esi
-; CHECK-NEXT:    xorl %ebp, %ebp
-; CHECK-NEXT:  .LBB1_14:
+; CHECK-NEXT:    subl $92, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movb %al, %ch
+; CHECK-NEXT:    andb $7, %ch
+; CHECK-NEXT:    shrb $3, %al
+; CHECK-NEXT:    negb %al
+; CHECK-NEXT:    movsbl %al, %eax
+; CHECK-NEXT:    movl 68(%esp,%eax), %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movb %ch, %cl
+; CHECK-NEXT:    shll %cl, %edx
+; CHECK-NEXT:    notb %cl
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movl 64(%esp,%eax), %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shrl %edi
+; CHECK-NEXT:    shrl %cl, %edi
+; CHECK-NEXT:    orl %edx, %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 76(%esp,%eax), %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movb %ch, %cl
+; CHECK-NEXT:    shll %cl, %edx
+; CHECK-NEXT:    movl 72(%esp,%eax), %ebx
 ; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shrl %ebx
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    shrl %cl, %ebx
+; CHECK-NEXT:    orl %edx, %ebx
+; CHECK-NEXT:    movl 84(%esp,%eax), %esi
 ; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    subb $64, %cl
-; CHECK-NEXT:    jb .LBB1_16
-; CHECK-NEXT:  # %bb.15:
-; CHECK-NEXT:    xorl %ebp, %ebp
-; CHECK-NEXT:  .LBB1_16:
-; CHECK-NEXT:    negb %cl
-; CHECK-NEXT:    movl $1, %esi
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    shrdl %cl, %ebx, %esi
-; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    jne .LBB1_18
-; CHECK-NEXT:  # %bb.17:
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:  .LBB1_18:
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    addb $-64, %cl
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    shldl %cl, %edi, %esi
-; CHECK-NEXT:    movl $1, %ebx
-; CHECK-NEXT:    shll %cl, %ebx
-; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    je .LBB1_20
-; CHECK-NEXT:  # %bb.19:
-; CHECK-NEXT:    movl %ebx, %esi
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:  .LBB1_20:
-; CHECK-NEXT:    cmpb $64, %al
-; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    jb .LBB1_22
-; CHECK-NEXT:  # %bb.21:
-; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:  .LBB1_22:
-; CHECK-NEXT:    testb %dl, %dl
-; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl $0, %edx
-; CHECK-NEXT:    je .LBB1_24
-; CHECK-NEXT:  # %bb.23:
-; CHECK-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:  .LBB1_24:
-; CHECK-NEXT:    movb $-128, %cl
-; CHECK-NEXT:    subb %al, %cl
-; CHECK-NEXT:    movl $1, %ebx
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    shrdl %cl, %esi, %ebx
-; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    movl $0, %esi
-; CHECK-NEXT:    jne .LBB1_26
-; CHECK-NEXT:  # %bb.25:
-; CHECK-NEXT:    movl %ebx, %esi
-; CHECK-NEXT:  .LBB1_26:
-; CHECK-NEXT:    cmpb $64, %cl
-; CHECK-NEXT:    jb .LBB1_28
-; CHECK-NEXT:  # %bb.27:
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:  .LBB1_28:
-; CHECK-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; CHECK-NEXT:    testb %cl, %cl
-; CHECK-NEXT:    je .LBB1_30
-; CHECK-NEXT:  # %bb.29:
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:  .LBB1_30:
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    movl $0, %ebp
-; CHECK-NEXT:    jne .LBB1_31
-; CHECK-NEXT:  # %bb.32:
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    js .LBB1_33
-; CHECK-NEXT:  .LBB1_34:
-; CHECK-NEXT:    movl $0, %eax
-; CHECK-NEXT:    jne .LBB1_35
-; CHECK-NEXT:  .LBB1_36:
-; CHECK-NEXT:    movl $0, %edi
-; CHECK-NEXT:    js .LBB1_37
-; CHECK-NEXT:    jmp .LBB1_39
-; CHECK-NEXT:  .LBB1_31:
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; CHECK-NEXT:    movb %ch, %cl
+; CHECK-NEXT:    shll %cl, %esi
+; CHECK-NEXT:    movl 80(%esp,%eax), %ebp
+; CHECK-NEXT:    movl %ebp, %edx
+; CHECK-NEXT:    shrl %edx
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    shrl %cl, %edx
+; CHECK-NEXT:    orl %esi, %edx
+; CHECK-NEXT:    movb %ch, %cl
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    shldl %cl, %esi, %ebp
+; CHECK-NEXT:    movl 60(%esp,%eax), %edi
+; CHECK-NEXT:    movl 88(%esp,%eax), %esi
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    jns .LBB1_34
-; CHECK-NEXT:  .LBB1_33:
-; CHECK-NEXT:    movl $0, %ebp
-; CHECK-NEXT:    movl %eax, %edi
-; CHECK-NEXT:    movl $0, %eax
-; CHECK-NEXT:    je .LBB1_36
-; CHECK-NEXT:  .LBB1_35:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movl $0, %edi
-; CHECK-NEXT:    jns .LBB1_39
-; CHECK-NEXT:  .LBB1_37:
-; CHECK-NEXT:    je .LBB1_39
-; CHECK-NEXT:  # %bb.38:
-; CHECK-NEXT:    movl %edx, %edi
-; CHECK-NEXT:  .LBB1_39:
-; CHECK-NEXT:    movl $0, %edx
-; CHECK-NEXT:    jns .LBB1_42
-; CHECK-NEXT:  # %bb.40:
-; CHECK-NEXT:    je .LBB1_42
-; CHECK-NEXT:  # %bb.41:
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:  .LBB1_42:
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl $0, %ebx
-; CHECK-NEXT:    je .LBB1_44
-; CHECK-NEXT:  # %bb.43:
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; CHECK-NEXT:  .LBB1_44:
-; CHECK-NEXT:    movl %eax, %ebp
-; CHECK-NEXT:    movl $0, %ecx
-; CHECK-NEXT:    jns .LBB1_46
-; CHECK-NEXT:  # %bb.45:
-; CHECK-NEXT:    movl $0, %ebx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:  .LBB1_46:
+; CHECK-NEXT:    shldl %cl, %eax, %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl $0, %esi
-; CHECK-NEXT:    je .LBB1_48
-; CHECK-NEXT:  # %bb.47:
-; CHECK-NEXT:    movl %ecx, %esi
-; CHECK-NEXT:  .LBB1_48:
-; CHECK-NEXT:    jns .LBB1_50
-; CHECK-NEXT:  # %bb.49:
-; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT:  .LBB1_50:
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 4(%eax)
-; CHECK-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, (%eax)
-; CHECK-NEXT:    movl %esi, 20(%eax)
-; CHECK-NEXT:    movl %ebx, 12(%eax)
-; CHECK-NEXT:    movl %edx, 28(%eax)
-; CHECK-NEXT:    movl %edi, 24(%eax)
-; CHECK-NEXT:    movl %ebp, 16(%eax)
+; CHECK-NEXT:    movl %esi, 28(%eax)
+; CHECK-NEXT:    movl %ebp, 20(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    movl %esi, 12(%eax)
+; CHECK-NEXT:    movl %edi, %esi
+; CHECK-NEXT:    shll %cl, %esi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; CHECK-NEXT:    shldl %cl, %edi, %ebp
+; CHECK-NEXT:    movl %ebp, 4(%eax)
+; CHECK-NEXT:    movl %esi, (%eax)
+; CHECK-NEXT:    movl %edx, 24(%eax)
+; CHECK-NEXT:    movl %ebx, 16(%eax)
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    addl $32, %esp
+; CHECK-NEXT:    addl $92, %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
@@ -241,98 +106,86 @@ define i256 @shift2(i256 %c) nounwind
 ;
 ; CHECK-X64-O0-LABEL: shift2:
 ; CHECK-X64-O0:       # %bb.0:
-; CHECK-X64-O0-NEXT:    pushq %r14
-; CHECK-X64-O0-NEXT:    pushq %rbx
 ; CHECK-X64-O0-NEXT:    movq %rdi, %rax
-; CHECK-X64-O0-NEXT:    movb %sil, %r11b
-; CHECK-X64-O0-NEXT:    movb $-128, %cl
-; CHECK-X64-O0-NEXT:    subb %r11b, %cl
+; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT:    movb %sil, %dl
+; CHECK-X64-O0-NEXT:    movb %dl, %cl
+; CHECK-X64-O0-NEXT:    andb $7, %cl
 ; CHECK-X64-O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-X64-O0-NEXT:    xorl %edx, %edx
-; CHECK-X64-O0-NEXT:    movl %edx, %esi
-; CHECK-X64-O0-NEXT:    movl $1, %r14d
-; CHECK-X64-O0-NEXT:    movq %r14, %r8
-; CHECK-X64-O0-NEXT:    shrdq %cl, %rsi, %r8
-; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    testb $64, %cl
-; CHECK-X64-O0-NEXT:    cmovneq %rsi, %r8
-; CHECK-X64-O0-NEXT:    movb %r11b, %bl
-; CHECK-X64-O0-NEXT:    addb $-128, %bl
-; CHECK-X64-O0-NEXT:    movb %bl, %cl
-; CHECK-X64-O0-NEXT:    movq %rsi, %rdx
-; CHECK-X64-O0-NEXT:    shldq %cl, %r14, %rdx
-; CHECK-X64-O0-NEXT:    movb %r11b, %cl
+; CHECK-X64-O0-NEXT:    shrb $3, %dl
+; CHECK-X64-O0-NEXT:    negb %dl
+; CHECK-X64-O0-NEXT:    movsbq %dl, %rdx
+; CHECK-X64-O0-NEXT:    movq -16(%rsp,%rdx), %rsi
 ; CHECK-X64-O0-NEXT:    movq %rsi, %r10
-; CHECK-X64-O0-NEXT:    shldq %cl, %r14, %r10
-; CHECK-X64-O0-NEXT:    movb %r11b, %cl
-; CHECK-X64-O0-NEXT:    movq %r14, %r9
-; CHECK-X64-O0-NEXT:    shlq %cl, %r9
-; CHECK-X64-O0-NEXT:    testb $64, %r11b
-; CHECK-X64-O0-NEXT:    cmovneq %r9, %r10
-; CHECK-X64-O0-NEXT:    cmovneq %rsi, %r9
-; CHECK-X64-O0-NEXT:    movb %bl, %cl
-; CHECK-X64-O0-NEXT:    shlq %cl, %r14
-; CHECK-X64-O0-NEXT:    movq %r14, %rcx
-; CHECK-X64-O0-NEXT:    testb $64, %bl
-; CHECK-X64-O0-NEXT:    cmovneq %rcx, %rdx
-; CHECK-X64-O0-NEXT:    cmovneq %rsi, %rcx
-; CHECK-X64-O0-NEXT:    testb %r11b, %r11b
-; CHECK-X64-O0-NEXT:    cmovnsq %r8, %rcx
-; CHECK-X64-O0-NEXT:    cmoveq %rsi, %rcx
-; CHECK-X64-O0-NEXT:    cmovnsq %rsi, %rdx
-; CHECK-X64-O0-NEXT:    cmoveq %rsi, %rdx
-; CHECK-X64-O0-NEXT:    movq %rsi, %r8
-; CHECK-X64-O0-NEXT:    cmovnsq %r10, %r8
-; CHECK-X64-O0-NEXT:    cmovnsq %r9, %rsi
+; CHECK-X64-O0-NEXT:    shlq %cl, %r10
+; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT:    notb %cl
+; CHECK-X64-O0-NEXT:    movq -32(%rsp,%rdx), %r9
+; CHECK-X64-O0-NEXT:    movq -24(%rsp,%rdx), %r8
+; CHECK-X64-O0-NEXT:    movq %r8, %r11
+; CHECK-X64-O0-NEXT:    shrq %r11
+; CHECK-X64-O0-NEXT:    shrq %cl, %r11
+; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT:    orq %r11, %r10
+; CHECK-X64-O0-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-X64-O0-NEXT:    movq -8(%rsp,%rdx), %rdx
+; CHECK-X64-O0-NEXT:    shldq %cl, %rsi, %rdx
+; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT:    movq %r9, %rsi
+; CHECK-X64-O0-NEXT:    shlq %cl, %rsi
+; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT:    shldq %cl, %r9, %r8
+; CHECK-X64-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-X64-O0-NEXT:    movq %r8, 8(%rdi)
 ; CHECK-X64-O0-NEXT:    movq %rsi, (%rdi)
 ; CHECK-X64-O0-NEXT:    movq %rdx, 24(%rdi)
 ; CHECK-X64-O0-NEXT:    movq %rcx, 16(%rdi)
-; CHECK-X64-O0-NEXT:    popq %rbx
-; CHECK-X64-O0-NEXT:    popq %r14
 ; CHECK-X64-O0-NEXT:    retq
 ;
 ; CHECK-X64-O2-LABEL: shift2:
 ; CHECK-X64-O2:       # %bb.0:
-; CHECK-X64-O2-NEXT:    pushq %rbx
 ; CHECK-X64-O2-NEXT:    movq %rdi, %rax
-; CHECK-X64-O2-NEXT:    movb $-128, %cl
-; CHECK-X64-O2-NEXT:    subb %sil, %cl
-; CHECK-X64-O2-NEXT:    xorl %r8d, %r8d
-; CHECK-X64-O2-NEXT:    movl $1, %edi
-; CHECK-X64-O2-NEXT:    movl $1, %r10d
-; CHECK-X64-O2-NEXT:    shrdq %cl, %r8, %r10
-; CHECK-X64-O2-NEXT:    testb $64, %cl
-; CHECK-X64-O2-NEXT:    cmovneq %r8, %r10
-; CHECK-X64-O2-NEXT:    leal -128(%rsi), %edx
-; CHECK-X64-O2-NEXT:    xorl %r9d, %r9d
+; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movl %esi, %edx
+; CHECK-X64-O2-NEXT:    andb $7, %dl
+; CHECK-X64-O2-NEXT:    shrb $3, %sil
+; CHECK-X64-O2-NEXT:    negb %sil
+; CHECK-X64-O2-NEXT:    movsbq %sil, %rsi
+; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rsi), %rdi
+; CHECK-X64-O2-NEXT:    movq %rdi, %r8
 ; CHECK-X64-O2-NEXT:    movl %edx, %ecx
-; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %r9
-; CHECK-X64-O2-NEXT:    xorl %r11d, %r11d
-; CHECK-X64-O2-NEXT:    movl %esi, %ecx
-; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %r11
-; CHECK-X64-O2-NEXT:    movl $1, %ebx
-; CHECK-X64-O2-NEXT:    shlq %cl, %rbx
-; CHECK-X64-O2-NEXT:    testb $64, %sil
-; CHECK-X64-O2-NEXT:    cmovneq %rbx, %r11
-; CHECK-X64-O2-NEXT:    cmovneq %r8, %rbx
+; CHECK-X64-O2-NEXT:    shlq %cl, %r8
+; CHECK-X64-O2-NEXT:    notb %cl
+; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rsi), %r9
+; CHECK-X64-O2-NEXT:    movq -24(%rsp,%rsi), %r10
+; CHECK-X64-O2-NEXT:    movq %r10, %r11
+; CHECK-X64-O2-NEXT:    shrq %r11
+; CHECK-X64-O2-NEXT:    shrq %cl, %r11
+; CHECK-X64-O2-NEXT:    orq %r8, %r11
+; CHECK-X64-O2-NEXT:    movq -8(%rsp,%rsi), %rsi
 ; CHECK-X64-O2-NEXT:    movl %edx, %ecx
+; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %rsi
+; CHECK-X64-O2-NEXT:    movq %r9, %rdi
 ; CHECK-X64-O2-NEXT:    shlq %cl, %rdi
-; CHECK-X64-O2-NEXT:    testb $64, %dl
-; CHECK-X64-O2-NEXT:    cmovneq %rdi, %r9
-; CHECK-X64-O2-NEXT:    cmovneq %r8, %rdi
-; CHECK-X64-O2-NEXT:    testb %sil, %sil
-; CHECK-X64-O2-NEXT:    cmovnsq %r10, %rdi
-; CHECK-X64-O2-NEXT:    cmoveq %r8, %rdi
-; CHECK-X64-O2-NEXT:    cmovnsq %r8, %r9
-; CHECK-X64-O2-NEXT:    cmoveq %r8, %r9
-; CHECK-X64-O2-NEXT:    cmovsq %r8, %r11
-; CHECK-X64-O2-NEXT:    cmovsq %r8, %rbx
-; CHECK-X64-O2-NEXT:    movq %r11, 8(%rax)
-; CHECK-X64-O2-NEXT:    movq %rbx, (%rax)
-; CHECK-X64-O2-NEXT:    movq %r9, 24(%rax)
-; CHECK-X64-O2-NEXT:    movq %rdi, 16(%rax)
-; CHECK-X64-O2-NEXT:    popq %rbx
+; CHECK-X64-O2-NEXT:    shldq %cl, %r9, %r10
+; CHECK-X64-O2-NEXT:    movq %rsi, 24(%rax)
+; CHECK-X64-O2-NEXT:    movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT:    movq %rdi, (%rax)
+; CHECK-X64-O2-NEXT:    movq %r11, 16(%rax)
 ; CHECK-X64-O2-NEXT:    retq
 {
   %b = shl i256 1, %c  ; %c must not be a constant

diff  --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index c6d3a5f7a90bd..bf91fb3da2036 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -622,420 +622,42 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: lshr_16bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel (%esp), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: lshr_16bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $36, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $36, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: lshr_16bytes:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl (%edx), %esi
+; X32-NEXT:    movl 4(%edx), %edi
+; X32-NEXT:    movl 8(%edx), %ebx
+; X32-NEXT:    movl 12(%edx), %edx
+; X32-NEXT:    movzbl (%ecx), %ecx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %esi, (%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $15, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %esi
+; X32-NEXT:    movl 12(%esp,%ecx), %edi
+; X32-NEXT:    movl 8(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 8(%eax)
+; X32-NEXT:    movl %edi, 12(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    movl %esi, 4(%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -1120,436 +742,44 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: shl_16bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $36, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $36, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: shl_16bytes:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl (%edx), %esi
+; X32-NEXT:    movl 4(%edx), %edi
+; X32-NEXT:    movl 8(%edx), %ebx
+; X32-NEXT:    movl 12(%edx), %edx
+; X32-NEXT:    movzbl (%ecx), %ecx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, (%esp)
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    negb %cl
+; X32-NEXT:    movsbl %cl, %ecx
+; X32-NEXT:    movl 16(%esp,%ecx), %edx
+; X32-NEXT:    movl 20(%esp,%ecx), %esi
+; X32-NEXT:    movl 28(%esp,%ecx), %edi
+; X32-NEXT:    movl 24(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 8(%eax)
+; X32-NEXT:    movl %edi, 12(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    movl %esi, 4(%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -1634,433 +864,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: ashr_16bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: ashr_16bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %edx, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ebx, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: ashr_16bytes:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl (%edx), %esi
+; X32-NEXT:    movl 4(%edx), %edi
+; X32-NEXT:    movl 8(%edx), %ebx
+; X32-NEXT:    movl 12(%edx), %edx
+; X32-NEXT:    movzbl (%ecx), %ecx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %esi, (%esp)
+; X32-NEXT:    sarl $31, %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $15, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %esi
+; X32-NEXT:    movl 12(%esp,%ecx), %edi
+; X32-NEXT:    movl 8(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 8(%eax)
+; X32-NEXT:    movl %edi, 12(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    movl %esi, 4(%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -2070,1948 +910,98 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbp, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %al, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rax), %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r12, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbp, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %r10, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rax, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rcx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r12, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %r9b, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r15, %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %rbx, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%r9), %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r11d, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %r9b, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r15, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %r11, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $120, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb (%eax), %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esi), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $120, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $160, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 20(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $160, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $128, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edi, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %esi, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $128, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: lshr_32bytes:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq 8(%rdi), %rcx
+; X64-NEXT:    movq 16(%rdi), %r8
+; X64-NEXT:    movq 24(%rdi), %rdi
+; X64-NEXT:    movzbl (%rsi), %esi
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $31, %esi
+; X64-NEXT:    movq -64(%rsp,%rsi), %rax
+; X64-NEXT:    movq -56(%rsp,%rsi), %rcx
+; X64-NEXT:    movq -40(%rsp,%rsi), %rdi
+; X64-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-NEXT:    movq %rsi, 16(%rdx)
+; X64-NEXT:    movq %rdi, 24(%rdx)
+; X64-NEXT:    movq %rax, (%rdx)
+; X64-NEXT:    movq %rcx, 8(%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: lshr_32bytes:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $72, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl (%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 4(%eax), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %esi
+; X32-NEXT:    movl 12(%eax), %edi
+; X32-NEXT:    movl 16(%eax), %ebx
+; X32-NEXT:    movl 20(%eax), %ebp
+; X32-NEXT:    movl 24(%eax), %edx
+; X32-NEXT:    movl 28(%eax), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzbl (%eax), %eax
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $31, %eax
+; X32-NEXT:    movl 8(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 12(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%esp,%eax), %esi
+; X32-NEXT:    movl 16(%esp,%eax), %edi
+; X32-NEXT:    movl 28(%esp,%eax), %ebx
+; X32-NEXT:    movl 24(%esp,%eax), %ebp
+; X32-NEXT:    movl 36(%esp,%eax), %edx
+; X32-NEXT:    movl 32(%esp,%eax), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %ecx, 24(%eax)
+; X32-NEXT:    movl %edx, 28(%eax)
+; X32-NEXT:    movl %ebp, 16(%eax)
+; X32-NEXT:    movl %ebx, 20(%eax)
+; X32-NEXT:    movl %edi, 8(%eax)
+; X32-NEXT:    movl %esi, 12(%eax)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    addl $72, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -4020,2001 +1010,102 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbp, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %al, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdx,%rdx), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rax), %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbp, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r10, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, (%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 8(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r14, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %rax, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %rcx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r8d, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r12, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %r8b, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r15, %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%r8), %ebx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %bl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %r8b, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r8, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r10, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r14, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%esp) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %ch, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %al # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $116, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $116, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $164, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $164, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $128, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebx), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebp, %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $al killed $al killed $eax def $eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %ch, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %esi, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $128, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: shl_32bytes:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq 8(%rdi), %rcx
+; X64-NEXT:    movq 16(%rdi), %r8
+; X64-NEXT:    movq 24(%rdi), %rdi
+; X64-NEXT:    movzbl (%rsi), %esi
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andb $31, %sil
+; X64-NEXT:    negb %sil
+; X64-NEXT:    movsbq %sil, %rax
+; X64-NEXT:    movq -32(%rsp,%rax), %rcx
+; X64-NEXT:    movq -24(%rsp,%rax), %rsi
+; X64-NEXT:    movq -8(%rsp,%rax), %rdi
+; X64-NEXT:    movq -16(%rsp,%rax), %rax
+; X64-NEXT:    movq %rax, 16(%rdx)
+; X64-NEXT:    movq %rdi, 24(%rdx)
+; X64-NEXT:    movq %rcx, (%rdx)
+; X64-NEXT:    movq %rsi, 8(%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: shl_32bytes:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $72, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl (%edx), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 4(%edx), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 8(%edx), %edi
+; X32-NEXT:    movl 12(%edx), %ebx
+; X32-NEXT:    movl 16(%edx), %ebp
+; X32-NEXT:    movzbl (%eax), %eax
+; X32-NEXT:    movl 20(%edx), %esi
+; X32-NEXT:    movl 24(%edx), %ecx
+; X32-NEXT:    movl 28(%edx), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andb $31, %al
+; X32-NEXT:    negb %al
+; X32-NEXT:    movsbl %al, %eax
+; X32-NEXT:    movl 40(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 44(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 52(%esp,%eax), %esi
+; X32-NEXT:    movl 48(%esp,%eax), %edi
+; X32-NEXT:    movl 60(%esp,%eax), %ebx
+; X32-NEXT:    movl 56(%esp,%eax), %ebp
+; X32-NEXT:    movl 68(%esp,%eax), %edx
+; X32-NEXT:    movl 64(%esp,%eax), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %ecx, 24(%eax)
+; X32-NEXT:    movl %edx, 28(%eax)
+; X32-NEXT:    movl %ebp, 16(%eax)
+; X32-NEXT:    movl %ebx, 20(%eax)
+; X32-NEXT:    movl %edi, 8(%eax)
+; X32-NEXT:    movl %esi, 12(%eax)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    addl $72, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -6023,1996 +1114,100 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
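For comparison with the new CHECK lines above: the X64 output for shl_32bytes is a direct rendering of the stack-based lowering, where the four 64-bit limbs are spilled next to a zero-filled half of a double-width slot and the result is reloaded at a byte offset computed from the negated, masked shift amount. A minimal C sketch of that byte-granular left shift (little-endian only, hypothetical helper name, not code from this patch):

  #include <stdint.h>
  #include <string.h>

  /* Illustrative only: byte-granular "shift through stack" as seen in the
     X64 shl_32bytes output above (little-endian, logical left shift).    */
  static void shl_32bytes_bytewise(const uint8_t src[32], unsigned byteOff,
                                   uint8_t dst[32]) {
    uint8_t slot[64];               /* 2x the width of the shifted type     */
    memset(slot, 0, 32);            /* zero-pad the low half                */
    memcpy(slot + 32, src, 32);     /* spill the value into the high half   */
    /* index below the spilled half by the masked byte offset and reload */
    memcpy(dst, slot + 32 - (byteOff & 31), 32);
  }
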
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r12,%r12), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rdx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %al, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rdx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rax), %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rdx, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r12, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%r10)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%r10)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%r10)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%r10)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rcx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %r8, %r10, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r8d, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bpl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbp, %r13, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %r14, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbp, %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %r8b, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %rbp, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rbp, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%r8), %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r11d, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r15, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %r11, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %r8b, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %rax, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r12, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r12d, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rax, %r11, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rbp, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $144, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %ah, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $144, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $124, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb (%eax), %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dh, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dh, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $124, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $168, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 20(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $168, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $132, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ebx, %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %eax, %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ebx, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edi, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $132, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: ashr_32bytes:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq 8(%rdi), %rcx
+; X64-NEXT:    movq 16(%rdi), %r8
+; X64-NEXT:    movq 24(%rdi), %rdi
+; X64-NEXT:    movzbl (%rsi), %esi
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    sarq $63, %rdi
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $31, %esi
+; X64-NEXT:    movq -64(%rsp,%rsi), %rax
+; X64-NEXT:    movq -56(%rsp,%rsi), %rcx
+; X64-NEXT:    movq -40(%rsp,%rsi), %rdi
+; X64-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-NEXT:    movq %rsi, 16(%rdx)
+; X64-NEXT:    movq %rdi, 24(%rdx)
+; X64-NEXT:    movq %rax, (%rdx)
+; X64-NEXT:    movq %rcx, 8(%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: ashr_32bytes:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $72, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl (%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 4(%eax), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %edi
+; X32-NEXT:    movl 12(%eax), %ebx
+; X32-NEXT:    movl 16(%eax), %ebp
+; X32-NEXT:    movl 20(%eax), %esi
+; X32-NEXT:    movl 24(%eax), %edx
+; X32-NEXT:    movl 28(%eax), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzbl (%eax), %eax
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    sarl $31, %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $31, %eax
+; X32-NEXT:    movl 8(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 12(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%esp,%eax), %esi
+; X32-NEXT:    movl 16(%esp,%eax), %edi
+; X32-NEXT:    movl 28(%esp,%eax), %ebx
+; X32-NEXT:    movl 24(%esp,%eax), %ebp
+; X32-NEXT:    movl 36(%esp,%eax), %edx
+; X32-NEXT:    movl 32(%esp,%eax), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %ecx, 24(%eax)
+; X32-NEXT:    movl %edx, 28(%eax)
+; X32-NEXT:    movl %ebp, 16(%eax)
+; X32-NEXT:    movl %ebx, 20(%eax)
+; X32-NEXT:    movl %edi, 8(%eax)
+; X32-NEXT:    movl %esi, 12(%eax)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    addl $72, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -8022,9 +1217,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X32: {{.*}}
 ; X32-NO-SHLD: {{.*}}
 ; X32-SHLD: {{.*}}
-; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
 ; X64-SHLD: {{.*}}
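
For readers skimming the diff, here is a minimal C sketch of what the new X64 ashr_32bytes sequence above computes: spill the four 64-bit words, pad the adjacent half of the slot with the sign word produced by "sarq $63", mask the byte offset to 31, and do a variably-indexed 32-byte load. The helper name is hypothetical and not part of the patch; it assumes a little-endian host, as on x86.

#include <stdint.h>
#include <string.h>

/* Hypothetical illustration (not in the patch): byte-granular arithmetic
 * right shift of a 256-bit value via a stack slot twice the shift width,
 * mirroring the new X64 ashr_32bytes lowering shown above. */
static void ashr_32bytes_by_bytes(const uint64_t src[4], unsigned byte_off,
                                  uint64_t dst[4]) {
  unsigned char slot[64];                            /* 2x-wide stack slot   */
  memcpy(slot, src, 32);                             /* low half: the value  */
  unsigned char pad = (src[3] >> 63) ? 0xFF : 0x00;  /* sign-bit fill byte   */
  memset(slot + 32, pad, 32);                        /* high half: sign ext. */
  byte_off &= 31;                                    /* matches "andl $31"   */
  memcpy(dst, slot + byte_off, 32);                  /* indexed load + split */
}

The X32 output above follows the same shape, just with eight 4-byte stores, the sign dword replicated across the upper half of the slot, and dword loads at "(%esp,%eax)" offsets.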

diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index c06cd5b4477fe..3b37dead8a77f 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,113 +588,61 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $15, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl %ah, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebp), %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ebp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -710,83 +658,46 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb (%eax), %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %ah, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %edx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -802,95 +713,48 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %bl, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -904,88 +768,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1076,110 +899,66 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $15, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ebp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ebp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1192,96 +971,50 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp), %edx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp), %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %ebp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1294,101 +1027,52 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, (%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %cl
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %ebp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, 28(%esp,%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1401,88 +1085,49 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%esi), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%esi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%esi), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%esi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1575,106 +1220,59 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $15, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ebp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1691,85 +1289,46 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
@@ -1784,101 +1343,52 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1894,84 +1404,44 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %edx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%ebp)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -1988,299 +1458,180 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %sil, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %al, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rax), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r12, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %r10, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rax, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %r8, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r12, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %r9b, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r15, %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %rbx, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%r9), %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r11d, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %r9b, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r15, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %r11, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X32-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -2289,476 +1640,127 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl %ch, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %dl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx), %ebp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %bl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx), %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edx), %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %ah, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %al # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2771,366 +1773,101 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $120, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp), %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp), %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $120, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3143,421 +1880,101 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $152, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 20(%esp,%esi), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $152, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3570,349 +1987,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $120, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $84, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ecx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edi, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%edi), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%edi), %edx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%edi), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, (%esp) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%edi), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%ebx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $120, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $84, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3927,306 +2085,188 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    negb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %sil, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r10), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r10), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r10), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbp, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %al, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdx,%rdx), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r10), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rax), %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbp, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r10, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, (%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 8(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%r9)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    negb %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movsbq %sil, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rsi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r14, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r8, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r9, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %rax, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r12, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %dil, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r15, %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rdi), %ebx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %bl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %dil, %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %sil, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rcx), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rcx), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rcx), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r8, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r10, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r14, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbq %sil, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %r10, %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X32-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -4235,456 +2275,129 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %bh # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %bh # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 16(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 20(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4697,371 +2410,103 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $116, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%ebx), %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%ebx), %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%ebx), %ebp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%ebx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%ebx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%ebx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%ebx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%ebx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%ebx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%ebx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ebx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 24(%ebx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 28(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $116, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -5074,416 +2519,108 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ecx), %ebx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ecx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb (%esp), %bl # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 84(%esp,%ecx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb (%esp), %dl # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $88, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5496,359 +2633,93 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $124, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $84, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %ecx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%esi), %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%esi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%esi), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%esi), %edx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $al killed $al killed $eax def $eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%esi), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%esi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb (%esp), %cl # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $124, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $84, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -5863,315 +2734,184 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %edx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %sil, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r12,%r12), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rdx), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbp, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r12, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%r10)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%r10)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%r10)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%r10)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rcx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %r9, %r10, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bpl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbp, %r13, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %r14, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbp, %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %r9b, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %rbp, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rbp, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%r9), %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r11d, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r15, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %r11, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %r9b, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %rax, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r12, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r12d, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rax, %r11, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rbp, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X32-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -6180,478 +2920,130 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $144, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl %ch, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx), %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %ah, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
 ; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $144, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -6664,354 +3056,104 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $116, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebx), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esi), %ebx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esi), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp), %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $116, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -7024,434 +3166,105 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esi), %ecx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %ecx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 20(%esp,%esi), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%edi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%edi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%edi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edi)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edi)
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edi)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -7464,346 +3277,91 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $132, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $84, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %edx, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %esi, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%edi), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%edi), %edx
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%edi), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, (%esp) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%edi), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%ebx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebx)
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $132, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $84, %esp
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index d1ad4192a6d2d..fbe635b67ec4c 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -627,239 +627,32 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movzbl (%esp,%ecx), %ecx
+; X32-NEXT:    movb %cl, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -944,240 +737,32 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %bp, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %bp, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %di, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movw %cx, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -1261,240 +846,32 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -1578,355 +955,34 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $24, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $24, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $24, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $24, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $28, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $28, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $28, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $28, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -1941,341 +997,58 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb %dil, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movzbl -64(%rsp,%rax), %eax
+; X64-NEXT:    movb %al, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movzbl (%esp,%ecx), %ecx
+; X32-NEXT:    movb %cl, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -2291,337 +1064,58 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movw %di, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %cx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-NEXT:    movw %ax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movw %cx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -2636,337 +1130,58 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movl -64(%rsp,%rax), %eax
+; X64-NEXT:    movl %eax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -2981,471 +1196,60 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rax, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $24, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $24, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $24, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $24, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $24, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, (%esp) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $24, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-NEXT:    movq %rax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -3460,589 +1264,70 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movq -64(%rsp,%rax), %rcx
+; X64-NEXT:    movq -56(%rsp,%rax), %rax
+; X64-NEXT:    movq %rax, 8(%rdx)
+; X64-NEXT:    movq %rcx, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %esi
+; X32-NEXT:    movl 8(%esp,%ecx), %edi
+; X32-NEXT:    movl 12(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 12(%eax)
+; X32-NEXT:    movl %edi, 8(%eax)
+; X32-NEXT:    movl %esi, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -4057,679 +1342,84 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 }
 
 define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovael %r10d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb %dil, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %r8d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %r8d, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %r9d, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r10d, %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %r9d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %r11d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %r8b, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $12, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $12, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%esi), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $12, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ebp), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $12, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm2
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ebx,8), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ebx # imm = 0x100
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movzbl -128(%rsp,%rsi), %eax
+; X64-NEXT:    movb %al, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $63, %ecx
+; X32-NEXT:    movzbl (%esp,%ecx), %ecx
+; X32-NEXT:    movb %cl, (%eax)
+; X32-NEXT:    addl $128, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -4745,674 +1435,84 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovael %r10d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movw %di, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %r8d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %r8d, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %r9d, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r10d, %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %r9d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %r11d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %r8w, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ebx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movw %bx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%edi), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $12, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ebp), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $12, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm2
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ebx,8), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ebx # imm = 0x100
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %cx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NEXT:    movw %ax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $63, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movw %cx, (%eax)
+; X32-NEXT:    addl $128, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -5427,671 +1527,84 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovael %r10d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %r8d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %r8d, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %r9d, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r10d, %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %r9d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %r11d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %r8d, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $12, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $12, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%edi), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $12, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ebp), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%esi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $12, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm2
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ebx,8), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ebx # imm = 0x100
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movl -128(%rsp,%rsi), %eax
+; X64-NEXT:    movl %eax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $63, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    addl $128, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -6106,1030 +1619,86 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r9, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r10, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rbx, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r11, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r8, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r10, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %rdi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rcx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r10, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $68, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ebx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $68, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $72, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $al killed $al killed $eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $-128, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %eax # imm = 0x100
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $72, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $80, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%eax), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %al, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $eax def $eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $80, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $72, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%eax), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $72, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NEXT:    movq %rax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $63, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $128, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -7144,1586 +1713,96 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %rbx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %rbx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r15, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r10, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r8d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r14d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r14, %r9, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %r12, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r12d, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r14d, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r14, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r14b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r12, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r12, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r8, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rax, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r15, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r9, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $112, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%eax), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %edx # imm = 0x100
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $112, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%esi,8), %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%edx,8), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %cl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $124, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%eax), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm3, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, (%esp) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al def $eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $eax def $eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, (%esp) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $124, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%ecx,8), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ecx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%eax,8), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
+; X64-NEXT:    movq %rcx, 8(%rdx)
+; X64-NEXT:    movq %rax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $63, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %esi
+; X32-NEXT:    movl 8(%esp,%ecx), %edi
+; X32-NEXT:    movl 12(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 12(%eax)
+; X32-NEXT:    movl %edi, 8(%eax)
+; X32-NEXT:    movl %esi, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $128, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -8738,2219 +1817,116 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 
 define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdx
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %rbx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r13, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 24(%r8)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 16(%r8)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 8(%r8)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, (%r8)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r12, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r13, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r9d, %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r14, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r15, %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %rax, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r12d, %ebx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %r10, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r12b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %rax, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rax, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r12, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r13, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $168, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%edx), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %ebp # imm = 0x100
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 24(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $168, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%ecx,8), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%eax,8), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $192, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%edi), %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %bl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, (%esp), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dh
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %bl, %dh
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dh, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dh, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 16(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $192, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%ecx,8), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel (%esp), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $-128, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%eax,8), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %eax # imm = 0x100
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
+; X64-NEXT:    movq -112(%rsp,%rsi), %rdi
+; X64-NEXT:    movq -104(%rsp,%rsi), %rsi
+; X64-NEXT:    movq %rsi, 24(%rdx)
+; X64-NEXT:    movq %rdi, 16(%rdx)
+; X64-NEXT:    movq %rcx, 8(%rdx)
+; X64-NEXT:    movq %rax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $136, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm0
+; X32-NEXT:    movdqu 16(%ecx), %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl $63, %eax
+; X32-NEXT:    movl 8(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 12(%esp,%eax), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 16(%esp,%eax), %esi
+; X32-NEXT:    movl 20(%esp,%eax), %edi
+; X32-NEXT:    movl 24(%esp,%eax), %ebx
+; X32-NEXT:    movl 28(%esp,%eax), %ebp
+; X32-NEXT:    movl 32(%esp,%eax), %edx
+; X32-NEXT:    movl 36(%esp,%eax), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %ecx, 28(%eax)
+; X32-NEXT:    movl %edx, 24(%eax)
+; X32-NEXT:    movl %ebp, 20(%eax)
+; X32-NEXT:    movl %ebx, 16(%eax)
+; X32-NEXT:    movl %edi, 12(%eax)
+; X32-NEXT:    movl %esi, 8(%eax)
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    addl $136, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -10965,9 +1941,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X32: {{.*}}
 ; X32-NO-SHLD: {{.*}}
 ; X32-SHLD: {{.*}}
-; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
 ; X64-SHLD: {{.*}}

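(Aside, not part of the patch: the regenerated X32/X64 check lines above show the new lowering shape directly in the assembly -- the vector halves are spilled to a stack slot, the slot's upper half is zeroed with plain stores, the byte offset is masked (e.g. "andl $63" / "andb $15") and used to index into the slot, and the requested chunk is read back with ordinary scalar loads. As a rough hand-written sketch only -- the function name and the 16-byte payload / 32-byte slot sizes are picked arbitrarily for illustration -- the equivalent pattern expressed in IR is roughly:

; Illustrative sketch only, not taken from this patch: spill a 16-byte value
; into a 32-byte stack slot, zero the upper half, then reload one byte at a
; variable (masked) byte offset instead of shifting a wide integer.
define void @byte_indexed_reload_sketch(ptr %src, i64 %byteOff, ptr %dst) {
  %val = load <16 x i8>, ptr %src, align 1
  %slot = alloca [32 x i8], align 16
  ; Lower half of the slot: the value being indexed into.
  store <16 x i8> %val, ptr %slot, align 16
  ; Upper half of the slot: padding (zero, as for a logical shift).
  %upper = getelementptr inbounds i8, ptr %slot, i64 16
  store <16 x i8> zeroinitializer, ptr %upper, align 16
  ; Index into the slot by the byte offset, clamped to the payload size.
  %off = and i64 %byteOff, 15
  %addr = getelementptr inbounds i8, ptr %slot, i64 %off
  %byte = load i8, ptr %addr, align 1
  store i8 %byte, ptr %dst, align 1
  ret void
}
)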
diff  --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index ab07986f289b8..7732118a537dc 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -603,239 +603,32 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movzbl (%esp,%ecx), %ecx
+; X32-NEXT:    movb %cl, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -918,240 +711,32 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %bp, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %bp, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %di, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movw %cx, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -1233,240 +818,32 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -1548,355 +925,34 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $24, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $24, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $24, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $24, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $28, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $28, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $28, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $28, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X32-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    andb $15, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -1911,636 +967,64 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; no @load_16byte_chunk_of_16byte_alloca
 
 define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orl %r11d, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %r9d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb %r8b, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %r11d, %r8d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %r8d, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r11d, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r11, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %r8d, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %r11d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %r8d, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, (%esp) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $16, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $16, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $12, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $12, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movzbl -64(%rsp,%rax), %eax
+; X64-NEXT:    movb %al, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movzbl (%esp,%ecx), %ecx
+; X32-NEXT:    movb %cl, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -2554,635 +1038,64 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orl %r11d, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %r9d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movw %r8w, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %r11d, %r8d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %r8d, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r11d, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r11, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %r8d, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %r11d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %r8d, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movw %bp, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %bp, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $16, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $16, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $12, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %cx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $12, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-NEXT:    movw %ax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movw %cx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -3195,635 +1108,64 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orl %r11d, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %r9d, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %r11d, %r8d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %r8d, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r11d, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r11, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %r8d, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %r11d, %r8d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %r8d, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $16, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $16, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $16, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $12, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $12, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movl -64(%rsp,%rax), %eax
+; X64-NEXT:    movl %eax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -3836,897 +1178,66 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r9, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r11, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r8, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r9d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %r10d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r11, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r8, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rcx, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r11, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $64, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm2
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $64, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $64, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %bl, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl (%esp), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $64, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movq -64(%rsp,%rax), %rax
+; X64-NEXT:    movq %rax, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -4739,1522 +1250,76 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
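The regenerated X64/X32 check lines above show what the new lowering produces for this test: the 32-byte value is spilled to the stack, the adjacent 32 bytes are zero-filled, and the 8-byte chunk is then fetched with a single load indexed by the byte offset (note the shrb $3 turning the bit count back into a byte index), replacing the long shift/cmov chains in the removed check lines. A minimal little-endian C sketch of that idea follows; the function and parameter names are illustrative only and are not part of the patch.

    #include <stdint.h>
    #include <string.h>

    /* Illustrative model of the stack-slot lowering reflected in the
       regenerated check lines above (little-endian byte indexing). */
    static uint64_t load_8byte_chunk(const unsigned char *src32, /* 32 bytes */
                                     uint64_t byteOff)           /* 0..24 */
    {
      unsigned char slot[64];                 /* 2x the value width       */
      memcpy(slot, src32, 32);                /* spill the value          */
      memset(slot + 32, 0, 32);               /* zero-pad the other half  */
      uint64_t chunk;
      memcpy(&chunk, slot + byteOff, sizeof(chunk)); /* one indexed load  */
      return chunk;
    }

The variably-indexed load is what lets the byte-granular offset be consumed directly instead of being expanded into per-word shift/select sequences.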
 
 define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r14, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r10, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r15, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r8d, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r14d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r14, %r9, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r11d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %r12, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r12d, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r14d, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r14, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r14b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rax, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r10d, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %rdi, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r9, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %sil, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r14, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r15, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subl $128, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %edi, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    addl $128, %esp
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $120, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $120, %esp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $144, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm1
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm0
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %bl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %eax, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %ebp, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%edx)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $144, %esp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $120, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ecx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %esi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel (%esp), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edi, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebx, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $120, %esp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NEXT:    shll $3, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $3, %sil
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movq -64(%rsp,%rax), %rcx
+; X64-NEXT:    movq -56(%rsp,%rax), %rax
+; X64-NEXT:    movq %rax, 8(%rdx)
+; X64-NEXT:    movq %rcx, (%rdx)
+; X64-NEXT:    retq
+;
+; X32-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqu (%edx), %xmm0
+; X32-NEXT:    movdqu 16(%edx), %xmm1
+; X32-NEXT:    shll $3, %ecx
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; X32-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm0, (%esp)
+; X32-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
+; X32-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    shrb $3, %cl
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movl (%esp,%ecx), %edx
+; X32-NEXT:    movl 4(%esp,%ecx), %esi
+; X32-NEXT:    movl 8(%esp,%ecx), %edi
+; X32-NEXT:    movl 12(%esp,%ecx), %ecx
+; X32-NEXT:    movl %ecx, 12(%eax)
+; X32-NEXT:    movl %edi, 8(%eax)
+; X32-NEXT:    movl %esi, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    addl $64, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -6269,9 +1334,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; no @load_32byte_chunk_of_32byte_alloca
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X32: {{.*}}
 ; X32-NO-SHLD: {{.*}}
 ; X32-SHLD: {{.*}}
-; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
 ; X64-SHLD: {{.*}}
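
For readers skimming the regenerated checks: the new X64 body of
load_16byte_chunk_of_32byte_alloca above spills the 32-byte source plus 32
bytes of zeroes onto the stack, turns the bit offset back into a byte offset
(shrb $3), and then simply loads the requested 16 bytes from
-64(%rsp,%rax) / -56(%rsp,%rax). Below is a minimal C sketch of that pattern,
for illustration only; it is not code from the patch, and the function name
load_16byte_chunk and the explicit "& 31" mask are mine, mirroring what the
generated assembly does rather than any LLVM API.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical illustration of the byte-indexed stack load emitted above. */
    void load_16byte_chunk(const uint8_t *src, uint64_t byteOff, uint8_t *dst) {
      uint8_t slot[64];                       /* 2x the 32-byte value, like the stack slot   */
      memcpy(slot, src, 32);                  /* the movdqu loads + movq spills              */
      memset(slot + 32, 0, 32);               /* the movq $0 stores: zero padding            */
      memcpy(dst, slot + (byteOff & 31), 16); /* the -64(%rsp,%rax) / -56(%rsp,%rax) loads   */
    }

The X32 body does the same thing with 4-byte movd/movl stores and four movl
loads instead of two 8-byte movq pairs.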


        

