[llvm] [Codegen][LegalizeIntegerTypes] Improve shift through stack (PR #96151)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 3 03:51:35 PDT 2024
https://github.com/futog updated https://github.com/llvm/llvm-project/pull/96151
>From 0b8dea8d16aaeb8c423adf965af5f605cadd201e Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Thu, 20 Jun 2024 10:08:16 +0200
Subject: [PATCH 1/4] [Codegen][LegalizeIntegerTypes] Improve shift through
stack
Minor improvement on cc39c3b17fb2598e20ca0854f9fe6d69169d85c7.
If the target does not support unaligned memory accesses, use the native
register alignment instead of byte alignment. The shift amount is also
split based on the native alignment, so loads happen only from aligned
addresses.
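As a rough illustration (not part of the patch itself), the split can be
written as plain integer arithmetic; `UnitBits` is a stand-in name for the
widest legal load/store width the lowering picks:

    // Illustrative sketch only: how the shift amount is split when the
    // native load/store unit is UnitBits wide (e.g. 32 on rv32).
    #include <cassert>
    #include <cstdint>

    struct SplitShift {
      uint64_t ByteOffset; // aligned byte offset into the 2x-wide stack slot
      uint64_t RemBits;    // residual shift, done with an ordinary shift node
    };

    SplitShift splitShiftAmount(uint64_t ShAmtBits, uint64_t UnitBits) {
      assert(UnitBits % 8 == 0 && (UnitBits & (UnitBits - 1)) == 0);
      uint64_t AlignedBits = ShAmtBits / UnitBits * UnitBits;
      return {AlignedBits / 8, ShAmtBits % UnitBits};
    }

    // E.g. shifting an i128 by 70 bits on rv32 (UnitBits = 32) becomes a
    // load at byte offset 8 plus a 6-bit shift of the loaded value.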
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 58 +-
llvm/test/CodeGen/RISCV/shifts.ll | 366 +-
...lar-shift-by-byte-multiple-legalization.ll | 3119 +++++++-------
.../RISCV/wide-scalar-shift-legalization.ll | 3581 +++++++----------
4 files changed, 3029 insertions(+), 4095 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a058b509b3aca..f21ed7581a5af 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4530,14 +4530,25 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
SDValue ShAmt = N->getOperand(1);
EVT ShAmtVT = ShAmt.getValueType();
- // This legalization is optimal when the shift is by a multiple of byte width,
- // %x * 8 <-> %x << 3 so 3 low bits should be be known zero.
- bool ShiftByByteMultiple =
- DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
+ EVT LoadStoreVT = VT;
+ do {
+ LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
+  } while (!TLI.isTypeLegal(LoadStoreVT));
+
+ const Align LoadStoreAlign = [&]() -> Align {
+ if (TLI.allowsMisalignedMemoryAccesses(LoadStoreVT))
+ return Align(1);
+
+ return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
+ }();
+
+ const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
+ const bool IsOneStepShift =
+ DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= Log2_32(ShiftUnitInBits);
// If we can't do it as one step, we'll have two uses of shift amount,
// and thus must freeze it.
- if (!ShiftByByteMultiple)
+ if (!IsOneStepShift)
ShAmt = DAG.getFreeze(ShAmt);
unsigned VTBitWidth = VT.getScalarSizeInBits();
@@ -4551,8 +4562,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
// Get a temporary stack slot 2x the width of our VT.
// FIXME: reuse stack slots?
- // FIXME: should we be more picky about alignment?
- Align StackSlotAlignment(1);
+ Align StackSlotAlignment(LoadStoreAlign);
SDValue StackPtr = DAG.CreateStackTemporary(
TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
EVT PtrTy = StackPtr.getValueType();
@@ -4577,16 +4587,22 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
// Now, compute the full-byte offset into stack slot from where we can load.
- // We have shift amount, which is in bits, but in multiples of byte.
- // So just divide by CHAR_BIT.
+  // We have the shift amount, which is in bits. The offset should point to an
+  // aligned address.
SDNodeFlags Flags;
- if (ShiftByByteMultiple)
+ if (IsOneStepShift)
Flags.setExact(true);
- SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
- DAG.getConstant(3, dl, ShAmtVT), Flags);
+  SDValue UnitLog2 = DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT);
+  SDValue OffsetInBits = DAG.getNode(
+      ISD::SHL, dl, ShAmtVT,
+      DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt, UnitLog2, Flags), UnitLog2);
+  Flags.setExact(true);
+  SDValue Offset = DAG.getNode(ISD::SRL, dl, ShAmtVT, OffsetInBits,
+                               DAG.getConstant(3, dl, ShAmtVT), Flags);
// And clamp it, because OOB load is an immediate UB,
// while shift overflow would have *just* been poison.
- ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset,
+ Offset = DAG.getNode(ISD::AND, dl, ShAmtVT, Offset,
DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
// We have exactly two strategies on indexing into stack slot here:
// 1. upwards starting from the beginning of the slot
@@ -4603,23 +4619,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
} else {
AdjStackPtr = DAG.getMemBasePlusOffset(
StackPtr, DAG.getConstant(VTByteWidth, dl, PtrTy), dl);
- ByteOffset = DAG.getNegative(ByteOffset, dl, ShAmtVT);
+ Offset = DAG.getNegative(Offset, dl, ShAmtVT);
}
// Get the pointer somewhere into the stack slot from which we need to load.
- ByteOffset = DAG.getSExtOrTrunc(ByteOffset, dl, PtrTy);
- AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
+ Offset = DAG.getSExtOrTrunc(Offset, dl, PtrTy);
+ AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
// And load it! While the load is not legal, legalizing it is obvious.
SDValue Res = DAG.getLoad(
VT, dl, Ch, AdjStackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
- // We've performed the shift by a CHAR_BIT * [_ShAmt / CHAR_BIT_]
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), LoadStoreAlign);
+  // We've performed the shift by ShiftUnitInBits * [_ShAmt / ShiftUnitInBits_]
- // If we may still have a less-than-CHAR_BIT to shift by, do so now.
- if (!ShiftByByteMultiple) {
+  // If we may still have bits left to shift by, do so now.
+ if (!IsOneStepShift) {
SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
- DAG.getConstant(7, dl, ShAmtVT));
+ DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
}
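For context before the test updates: the pointer arithmetic above folds down
to the following (an illustrative C++ sketch with hypothetical names, not the
patch's code; lshr/ashr index upward from the slot base, shl downward from
the midpoint of the 2x-wide slot):

    #include <cstdint>

    // Byte offset of the load within the 2*VTByteWidth stack slot.
    // VTByteWidth is a power of two here, so AND works as a clamp.
    uint64_t loadOffsetInSlot(uint64_t ShAmt, uint64_t UnitBits,
                              uint64_t VTByteWidth, bool IsShiftLeft) {
      uint64_t Off = (ShAmt / UnitBits * UnitBits) / 8; // aligned bytes
      Off &= VTByteWidth - 1; // clamp: an OOB load would be immediate UB
      return IsShiftLeft ? VTByteWidth - Off : Off;
    }

For i128 on rv32 (UnitBits = 32, VTByteWidth = 16), (ShAmt / 32 * 32) / 8
masked to 15 is exactly the `srli a1, a2, 3` / `andi a1, a1, 12` pattern
visible in the updated checks below.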
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed725..5ba8755201ddf 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb a1, 12(sp)
-; RV32I-NEXT: sb a5, 8(sp)
-; RV32I-NEXT: sb a4, 4(sp)
-; RV32I-NEXT: sb a3, 0(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 15(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 14(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 13(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 11(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 10(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 9(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 7(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 6(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 3(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 2(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 1(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a1, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 8(a1)
; RV32I-NEXT: lw a5, 4(a1)
; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: sb a3, 12(sp)
-; RV32I-NEXT: sb a4, 8(sp)
-; RV32I-NEXT: sb a5, 4(sp)
-; RV32I-NEXT: sb a1, 0(sp)
-; RV32I-NEXT: srai a6, a3, 31
-; RV32I-NEXT: sb a6, 28(sp)
-; RV32I-NEXT: sb a6, 24(sp)
-; RV32I-NEXT: sb a6, 20(sp)
-; RV32I-NEXT: sb a6, 16(sp)
-; RV32I-NEXT: srli a7, a3, 24
-; RV32I-NEXT: sb a7, 15(sp)
-; RV32I-NEXT: srli a7, a3, 16
-; RV32I-NEXT: sb a7, 14(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(sp)
-; RV32I-NEXT: srli a3, a4, 24
-; RV32I-NEXT: sb a3, 11(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 10(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 9(sp)
-; RV32I-NEXT: srli a3, a5, 24
-; RV32I-NEXT: sb a3, 7(sp)
-; RV32I-NEXT: srli a3, a5, 16
-; RV32I-NEXT: sb a3, 6(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 5(sp)
-; RV32I-NEXT: srli a3, a1, 24
-; RV32I-NEXT: sb a3, 3(sp)
-; RV32I-NEXT: srli a3, a1, 16
-; RV32I-NEXT: sb a3, 2(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 1(sp)
-; RV32I-NEXT: srli a1, a6, 24
-; RV32I-NEXT: sb a1, 31(sp)
-; RV32I-NEXT: srli a3, a6, 16
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: sb a3, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: sb a3, 18(sp)
-; RV32I-NEXT: sb a4, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw a3, 12(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw a1, 0(sp)
+; RV32I-NEXT: srai a3, a3, 31
+; RV32I-NEXT: sw a3, 28(sp)
+; RV32I-NEXT: sw a3, 24(sp)
+; RV32I-NEXT: sw a3, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb zero, 11(sp)
-; RV32I-NEXT: sb zero, 10(sp)
-; RV32I-NEXT: sb zero, 9(sp)
-; RV32I-NEXT: sb zero, 8(sp)
-; RV32I-NEXT: sb zero, 7(sp)
-; RV32I-NEXT: sb zero, 6(sp)
-; RV32I-NEXT: sb zero, 5(sp)
-; RV32I-NEXT: sb zero, 4(sp)
-; RV32I-NEXT: sb zero, 3(sp)
-; RV32I-NEXT: sb zero, 2(sp)
-; RV32I-NEXT: sb zero, 1(sp)
-; RV32I-NEXT: sb zero, 0(sp)
-; RV32I-NEXT: sb a1, 28(sp)
-; RV32I-NEXT: sb a5, 24(sp)
-; RV32I-NEXT: sb a4, 20(sp)
-; RV32I-NEXT: sb a3, 16(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 31(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 30(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 29(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 26(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 25(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 22(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 18(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a1, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: addi a3, sp, 16
-; RV32I-NEXT: sub a1, a3, a1
-; RV32I-NEXT: lbu a3, 5(a1)
-; RV32I-NEXT: lbu a4, 4(a1)
-; RV32I-NEXT: lbu a5, 6(a1)
-; RV32I-NEXT: lbu a6, 7(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
-; RV32I-NEXT: sll a4, a3, a2
-; RV32I-NEXT: lbu a5, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
-; RV32I-NEXT: lbu a7, 2(a1)
-; RV32I-NEXT: lbu t0, 3(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: srli a6, a5, 1
-; RV32I-NEXT: xori a7, a2, 31
+; RV32I-NEXT: sub a3, a3, a1
+; RV32I-NEXT: lw a1, 4(a3)
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a1, a2
+; RV32I-NEXT: srli a6, a4, 1
+; RV32I-NEXT: andi a7, a2, 31
+; RV32I-NEXT: lw t0, 8(a3)
+; RV32I-NEXT: xori a7, a7, 31
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: lbu a6, 9(a1)
-; RV32I-NEXT: lbu t0, 8(a1)
-; RV32I-NEXT: lbu t1, 10(a1)
-; RV32I-NEXT: lbu t2, 11(a1)
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: or a6, a6, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: sll t0, a6, a2
-; RV32I-NEXT: srli a3, a3, 1
-; RV32I-NEXT: not t1, a2
-; RV32I-NEXT: srl a3, a3, t1
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: lbu t0, 13(a1)
-; RV32I-NEXT: lbu t1, 12(a1)
-; RV32I-NEXT: lbu t2, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t2
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: sll a1, a1, a2
-; RV32I-NEXT: srli a6, a6, 1
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: sll a6, t0, a2
+; RV32I-NEXT: lw a3, 12(a3)
+; RV32I-NEXT: srli a1, a1, 1
+; RV32I-NEXT: srl a1, a1, a7
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: srli a6, t0, 1
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: sll a2, a5, a2
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: sll a2, a4, a2
; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a1, 12(a0)
-; RV32I-NEXT: sw a3, 8(a0)
-; RV32I-NEXT: sw a4, 4(a0)
+; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a1, 8(a0)
+; RV32I-NEXT: sw a5, 4(a0)
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92b..0b87bb05cfd63 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb a0, 19(sp)
-; RV32I-NEXT: sb s2, 18(sp)
-; RV32I-NEXT: sb s1, 17(sp)
-; RV32I-NEXT: sb s0, 16(sp)
-; RV32I-NEXT: sb t6, 15(sp)
-; RV32I-NEXT: sb t5, 14(sp)
-; RV32I-NEXT: sb t4, 13(sp)
-; RV32I-NEXT: sb t3, 12(sp)
-; RV32I-NEXT: sb t2, 11(sp)
-; RV32I-NEXT: sb t1, 10(sp)
-; RV32I-NEXT: sb t0, 9(sp)
-; RV32I-NEXT: sb a7, 8(sp)
-; RV32I-NEXT: sb a6, 7(sp)
-; RV32I-NEXT: sb a5, 6(sp)
-; RV32I-NEXT: sb a4, 5(sp)
-; RV32I-NEXT: sb a3, 4(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: addi a0, sp, 4
-; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or t0, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sll a6, t1, a6
+; RV32I-NEXT: or a6, a5, a6
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a6, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a6, 24
+; RV32I-NEXT: sb a0, 11(a2)
+; RV32I-NEXT: srli a0, a6, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, t0, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a0, t0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -942,98 +961,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb zero, 11(sp)
-; RV32I-NEXT: sb zero, 10(sp)
-; RV32I-NEXT: sb zero, 9(sp)
-; RV32I-NEXT: sb zero, 8(sp)
-; RV32I-NEXT: sb zero, 7(sp)
-; RV32I-NEXT: sb zero, 6(sp)
-; RV32I-NEXT: sb zero, 5(sp)
-; RV32I-NEXT: sb zero, 4(sp)
-; RV32I-NEXT: sb a0, 35(sp)
-; RV32I-NEXT: sb s2, 34(sp)
-; RV32I-NEXT: sb s1, 33(sp)
-; RV32I-NEXT: sb s0, 32(sp)
-; RV32I-NEXT: sb t6, 31(sp)
-; RV32I-NEXT: sb t5, 30(sp)
-; RV32I-NEXT: sb t4, 29(sp)
-; RV32I-NEXT: sb t3, 28(sp)
-; RV32I-NEXT: sb t2, 27(sp)
-; RV32I-NEXT: sb t1, 26(sp)
-; RV32I-NEXT: sb t0, 25(sp)
-; RV32I-NEXT: sb a7, 24(sp)
-; RV32I-NEXT: sb a6, 23(sp)
-; RV32I-NEXT: sb a5, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a3, 20(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: addi a0, sp, 20
-; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: addi a3, sp, 16
+; RV32I-NEXT: sub a3, a3, a0
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a0, a1
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: srli a7, a4, 1
+; RV32I-NEXT: lw t0, 12(a3)
+; RV32I-NEXT: lw a3, 8(a3)
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: sll t0, t0, a1
+; RV32I-NEXT: srli t1, a3, 1
+; RV32I-NEXT: srl t1, t1, a6
+; RV32I-NEXT: or t1, t0, t1
+; RV32I-NEXT: sll a3, a3, a1
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a3, a3, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, t0, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a5, a5, 24
+; RV32I-NEXT: sb a5, 7(a2)
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb t1, 12(a2)
+; RV32I-NEXT: sb a7, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: srli a0, t1, 16
+; RV32I-NEXT: sb a0, 14(a2)
+; RV32I-NEXT: srli a0, t1, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a7, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a7, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1161,105 +1199,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t2, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 14(a0)
-; RV32I-NEXT: lbu a0, 13(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 15(sp)
-; RV32I-NEXT: sb s3, 14(sp)
-; RV32I-NEXT: sb a0, 13(sp)
-; RV32I-NEXT: sb s2, 12(sp)
-; RV32I-NEXT: sb s1, 11(sp)
-; RV32I-NEXT: sb s0, 10(sp)
-; RV32I-NEXT: sb t6, 9(sp)
-; RV32I-NEXT: sb t5, 8(sp)
-; RV32I-NEXT: sb t4, 7(sp)
-; RV32I-NEXT: sb t3, 6(sp)
-; RV32I-NEXT: sb t2, 5(sp)
-; RV32I-NEXT: sb t1, 4(sp)
-; RV32I-NEXT: sb t0, 3(sp)
-; RV32I-NEXT: sb a7, 2(sp)
-; RV32I-NEXT: sb a6, 1(sp)
-; RV32I-NEXT: sb a5, 0(sp)
-; RV32I-NEXT: srai a4, a4, 31
-; RV32I-NEXT: sb a4, 28(sp)
-; RV32I-NEXT: sb a4, 24(sp)
-; RV32I-NEXT: sb a4, 20(sp)
-; RV32I-NEXT: sb a4, 16(sp)
-; RV32I-NEXT: srli a0, a4, 24
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: sb a0, 23(sp)
-; RV32I-NEXT: sb a3, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a0, 19(sp)
-; RV32I-NEXT: sb a3, 18(sp)
-; RV32I-NEXT: sb a4, 17(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: mv a0, sp
-; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or t0, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sll a6, t1, a6
+; RV32I-NEXT: or a6, a5, a6
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a6, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a6, 24
+; RV32I-NEXT: sb a0, 11(a2)
+; RV32I-NEXT: srli a0, a6, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, t0, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a0, t0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1272,441 +1323,429 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 87(sp)
-; RV64I-NEXT: sb a4, 86(sp)
-; RV64I-NEXT: sb a0, 85(sp)
-; RV64I-NEXT: sb a5, 84(sp)
-; RV64I-NEXT: sb a6, 83(sp)
-; RV64I-NEXT: sb a7, 82(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t0, 81(sp)
-; RV64I-NEXT: sb ra, 80(sp)
-; RV64I-NEXT: sb s11, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 56
-; RV64I-NEXT: add a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a5, a4, a1
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: srl t0, t0, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: srl a1, a3, a1
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a5, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu t0, 25(a0)
-; RV32I-NEXT: lbu a7, 26(a0)
-; RV32I-NEXT: lbu a6, 27(a0)
-; RV32I-NEXT: lbu a5, 28(a0)
-; RV32I-NEXT: lbu a3, 31(a0)
-; RV32I-NEXT: lbu a4, 30(a0)
-; RV32I-NEXT: lbu a0, 29(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 59(sp)
-; RV32I-NEXT: sb a4, 58(sp)
-; RV32I-NEXT: sb a0, 57(sp)
-; RV32I-NEXT: sb a5, 56(sp)
-; RV32I-NEXT: sb a6, 55(sp)
-; RV32I-NEXT: sb a7, 54(sp)
-; RV32I-NEXT: sb zero, 91(sp)
-; RV32I-NEXT: sb zero, 90(sp)
-; RV32I-NEXT: sb zero, 89(sp)
-; RV32I-NEXT: sb zero, 88(sp)
-; RV32I-NEXT: sb zero, 87(sp)
-; RV32I-NEXT: sb zero, 86(sp)
-; RV32I-NEXT: sb zero, 85(sp)
-; RV32I-NEXT: sb zero, 84(sp)
-; RV32I-NEXT: sb zero, 83(sp)
-; RV32I-NEXT: sb zero, 82(sp)
-; RV32I-NEXT: sb zero, 81(sp)
-; RV32I-NEXT: sb zero, 80(sp)
-; RV32I-NEXT: sb zero, 79(sp)
-; RV32I-NEXT: sb zero, 78(sp)
-; RV32I-NEXT: sb zero, 77(sp)
-; RV32I-NEXT: sb zero, 76(sp)
-; RV32I-NEXT: sb zero, 75(sp)
-; RV32I-NEXT: sb zero, 74(sp)
-; RV32I-NEXT: sb zero, 73(sp)
-; RV32I-NEXT: sb zero, 72(sp)
-; RV32I-NEXT: sb zero, 71(sp)
-; RV32I-NEXT: sb zero, 70(sp)
-; RV32I-NEXT: sb zero, 69(sp)
-; RV32I-NEXT: sb zero, 68(sp)
-; RV32I-NEXT: sb zero, 67(sp)
-; RV32I-NEXT: sb zero, 66(sp)
-; RV32I-NEXT: sb zero, 65(sp)
-; RV32I-NEXT: sb zero, 64(sp)
-; RV32I-NEXT: sb zero, 63(sp)
-; RV32I-NEXT: sb zero, 62(sp)
-; RV32I-NEXT: sb zero, 61(sp)
-; RV32I-NEXT: sb zero, 60(sp)
-; RV32I-NEXT: sb t0, 53(sp)
-; RV32I-NEXT: sb ra, 52(sp)
-; RV32I-NEXT: sb s11, 51(sp)
-; RV32I-NEXT: sb s10, 50(sp)
-; RV32I-NEXT: sb s9, 49(sp)
-; RV32I-NEXT: sb s8, 48(sp)
-; RV32I-NEXT: sb s7, 47(sp)
-; RV32I-NEXT: sb s6, 46(sp)
-; RV32I-NEXT: sb s5, 45(sp)
-; RV32I-NEXT: sb s4, 44(sp)
-; RV32I-NEXT: sb s3, 43(sp)
-; RV32I-NEXT: sb s2, 42(sp)
-; RV32I-NEXT: sb s1, 41(sp)
-; RV32I-NEXT: sb s0, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: andi a1, a1, 31
-; RV32I-NEXT: addi a0, sp, 28
-; RV32I-NEXT: add a6, a0, a1
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: sb a4, 29(a2)
-; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
-; RV32I-NEXT: ret
- %src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t2, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sw zero, 64(sp)
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a3, 4(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: addi a3, sp, 4
+; RV32I-NEXT: add a5, a3, a0
+; RV32I-NEXT: lw a3, 4(a5)
+; RV32I-NEXT: slli a6, a1, 3
+; RV32I-NEXT: srl a4, a3, a6
+; RV32I-NEXT: lw a7, 8(a5)
+; RV32I-NEXT: andi a0, a6, 24
+; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: lw a1, 0(a5)
+; RV32I-NEXT: slli a0, a7, 1
+; RV32I-NEXT: sll a0, a0, t0
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: srl t1, a1, a6
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw t2, 12(a5)
+; RV32I-NEXT: lw t3, 16(a5)
+; RV32I-NEXT: sll a1, a3, t0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: srl t4, t2, a6
+; RV32I-NEXT: slli a3, t3, 1
+; RV32I-NEXT: sll a3, a3, t0
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw t5, 20(a5)
+; RV32I-NEXT: lw t6, 24(a5)
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: or t2, a7, t2
+; RV32I-NEXT: srl s0, t5, a6
+; RV32I-NEXT: slli s1, t6, 1
+; RV32I-NEXT: sll s1, s1, t0
+; RV32I-NEXT: or s1, s0, s1
+; RV32I-NEXT: srl t3, t3, a6
+; RV32I-NEXT: slli t5, t5, 1
+; RV32I-NEXT: lw a5, 28(a5)
+; RV32I-NEXT: sll t5, t5, t0
+; RV32I-NEXT: or t5, t3, t5
+; RV32I-NEXT: srl t6, t6, a6
+; RV32I-NEXT: slli s2, a5, 1
+; RV32I-NEXT: sll t0, s2, t0
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: srl a5, a5, a6
+; RV32I-NEXT: sb t6, 24(a2)
+; RV32I-NEXT: sb a5, 28(a2)
+; RV32I-NEXT: sb t3, 16(a2)
+; RV32I-NEXT: sb s0, 20(a2)
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb t4, 12(a2)
+; RV32I-NEXT: sb t1, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 31(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 27(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t5, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t5, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t5, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, s1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, s1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli s1, s1, 8
+; RV32I-NEXT: sb s1, 21(a2)
+; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
%bitOff = shl i256 %byteOff, 3
%res = lshr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
@@ -1715,438 +1754,426 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 119(sp)
-; RV64I-NEXT: sb a4, 118(sp)
-; RV64I-NEXT: sb a0, 117(sp)
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t0, 113(sp)
-; RV64I-NEXT: sb ra, 112(sp)
-; RV64I-NEXT: sb s11, 111(sp)
-; RV64I-NEXT: sb s10, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 88
-; RV64I-NEXT: sub a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: ld a5, 0(a3)
+; RV64I-NEXT: sll a6, a4, a1
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: srli a0, a5, 1
+; RV64I-NEXT: ld t0, 24(a3)
+; RV64I-NEXT: ld a3, 16(a3)
+; RV64I-NEXT: srl a0, a0, a7
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: sll t0, t0, a1
+; RV64I-NEXT: srli t1, a3, 1
+; RV64I-NEXT: srl t1, t1, a7
+; RV64I-NEXT: or t1, t0, t1
+; RV64I-NEXT: sll a3, a3, a1
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a7
+; RV64I-NEXT: or a4, a3, a4
+; RV64I-NEXT: sll a1, a5, a1
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: srli a3, a3, 56
+; RV64I-NEXT: sb a3, 23(a2)
+; RV64I-NEXT: srli a3, t0, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 7(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 6(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 5(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a1, a6, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb t1, 24(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a4, 48
; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 17(a2)
+; RV64I-NEXT: srli a1, t1, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, t1, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, t1, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, t1, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, t1, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a1, t1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu t0, 25(a0)
-; RV32I-NEXT: lbu a7, 26(a0)
-; RV32I-NEXT: lbu a6, 27(a0)
-; RV32I-NEXT: lbu a5, 28(a0)
-; RV32I-NEXT: lbu a3, 31(a0)
-; RV32I-NEXT: lbu a4, 30(a0)
-; RV32I-NEXT: lbu a0, 29(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 91(sp)
-; RV32I-NEXT: sb a4, 90(sp)
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a5, 88(sp)
-; RV32I-NEXT: sb a6, 87(sp)
-; RV32I-NEXT: sb a7, 86(sp)
-; RV32I-NEXT: sb zero, 59(sp)
-; RV32I-NEXT: sb zero, 58(sp)
-; RV32I-NEXT: sb zero, 57(sp)
-; RV32I-NEXT: sb zero, 56(sp)
-; RV32I-NEXT: sb zero, 55(sp)
-; RV32I-NEXT: sb zero, 54(sp)
-; RV32I-NEXT: sb zero, 53(sp)
-; RV32I-NEXT: sb zero, 52(sp)
-; RV32I-NEXT: sb zero, 51(sp)
-; RV32I-NEXT: sb zero, 50(sp)
-; RV32I-NEXT: sb zero, 49(sp)
-; RV32I-NEXT: sb zero, 48(sp)
-; RV32I-NEXT: sb zero, 47(sp)
-; RV32I-NEXT: sb zero, 46(sp)
-; RV32I-NEXT: sb zero, 45(sp)
-; RV32I-NEXT: sb zero, 44(sp)
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb t0, 85(sp)
-; RV32I-NEXT: sb ra, 84(sp)
-; RV32I-NEXT: sb s11, 83(sp)
-; RV32I-NEXT: sb s10, 82(sp)
-; RV32I-NEXT: sb s9, 81(sp)
-; RV32I-NEXT: sb s8, 80(sp)
-; RV32I-NEXT: sb s7, 79(sp)
-; RV32I-NEXT: sb s6, 78(sp)
-; RV32I-NEXT: sb s5, 77(sp)
-; RV32I-NEXT: sb s4, 76(sp)
-; RV32I-NEXT: sb s3, 75(sp)
-; RV32I-NEXT: sb s2, 74(sp)
-; RV32I-NEXT: sb s1, 73(sp)
-; RV32I-NEXT: sb s0, 72(sp)
-; RV32I-NEXT: sb t6, 71(sp)
-; RV32I-NEXT: sb t5, 70(sp)
-; RV32I-NEXT: sb t4, 69(sp)
-; RV32I-NEXT: sb t3, 68(sp)
-; RV32I-NEXT: sb t2, 67(sp)
-; RV32I-NEXT: sb t1, 66(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 63(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 62(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: andi a1, a1, 31
-; RV32I-NEXT: addi a0, sp, 60
-; RV32I-NEXT: sub a6, a0, a1
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t2, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw a0, 64(sp)
+; RV32I-NEXT: sw t1, 60(sp)
+; RV32I-NEXT: sw t0, 56(sp)
+; RV32I-NEXT: sw a7, 52(sp)
+; RV32I-NEXT: sw a6, 48(sp)
+; RV32I-NEXT: sw a5, 44(sp)
+; RV32I-NEXT: sw a4, 40(sp)
+; RV32I-NEXT: sw a3, 36(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: addi a3, sp, 36
+; RV32I-NEXT: sub a6, a3, a0
+; RV32I-NEXT: lw a3, 4(a6)
+; RV32I-NEXT: slli a7, a1, 3
+; RV32I-NEXT: lw t0, 0(a6)
+; RV32I-NEXT: sll a4, a3, a7
+; RV32I-NEXT: andi a0, a7, 24
+; RV32I-NEXT: xori t1, a0, 31
+; RV32I-NEXT: srli a0, t0, 1
+; RV32I-NEXT: lw t2, 12(a6)
+; RV32I-NEXT: lw a5, 8(a6)
+; RV32I-NEXT: srl a0, a0, t1
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: sll t3, t2, a7
+; RV32I-NEXT: srli a1, a5, 1
+; RV32I-NEXT: srl a1, a1, t1
+; RV32I-NEXT: or a1, t3, a1
+; RV32I-NEXT: sll t4, a5, a7
+; RV32I-NEXT: srli a3, a3, 1
+; RV32I-NEXT: lw t5, 20(a6)
+; RV32I-NEXT: lw t6, 16(a6)
+; RV32I-NEXT: srl a3, a3, t1
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: sll s0, t5, a7
+; RV32I-NEXT: srli a5, t6, 1
+; RV32I-NEXT: srl a5, a5, t1
+; RV32I-NEXT: or a5, s0, a5
+; RV32I-NEXT: sll t6, t6, a7
+; RV32I-NEXT: srli t2, t2, 1
+; RV32I-NEXT: lw s1, 28(a6)
+; RV32I-NEXT: lw a6, 24(a6)
+; RV32I-NEXT: srl t2, t2, t1
+; RV32I-NEXT: or t2, t6, t2
+; RV32I-NEXT: sll s1, s1, a7
+; RV32I-NEXT: srli s2, a6, 1
+; RV32I-NEXT: srl s2, s2, t1
+; RV32I-NEXT: or s2, s1, s2
+; RV32I-NEXT: sll a6, a6, a7
+; RV32I-NEXT: srli t5, t5, 1
+; RV32I-NEXT: srl t1, t5, t1
+; RV32I-NEXT: or t1, a6, t1
+; RV32I-NEXT: sll a7, t0, a7
+; RV32I-NEXT: sb a7, 0(a2)
+; RV32I-NEXT: srli a6, a6, 24
+; RV32I-NEXT: sb a6, 27(a2)
+; RV32I-NEXT: srli s1, s1, 24
+; RV32I-NEXT: sb s1, 31(a2)
+; RV32I-NEXT: srli a6, t6, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli s0, s0, 24
+; RV32I-NEXT: sb s0, 23(a2)
+; RV32I-NEXT: srli a6, t4, 24
+; RV32I-NEXT: sb a6, 11(a2)
+; RV32I-NEXT: srli a6, t3, 24
+; RV32I-NEXT: sb a6, 15(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 2(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 1(a2)
+; RV32I-NEXT: srli a4, a4, 24
+; RV32I-NEXT: sb a4, 7(a2)
+; RV32I-NEXT: sb t1, 24(a2)
+; RV32I-NEXT: sb s2, 28(a2)
+; RV32I-NEXT: sb t2, 16(a2)
+; RV32I-NEXT: sb a5, 20(a2)
+; RV32I-NEXT: sb a3, 8(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, s2, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a4, s2, 8
; RV32I-NEXT: sb a4, 29(a2)
-; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 21(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 13(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2158,454 +2185,428 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: mv t0, a1
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a1, 0(a0)
-; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 1(a0)
-; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 2(a0)
-; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 3(a0)
-; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 4(a0)
-; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 5(a0)
-; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t2, 6(a0)
-; RV64I-NEXT: lbu t3, 7(a0)
-; RV64I-NEXT: lbu t4, 8(a0)
-; RV64I-NEXT: lbu t5, 9(a0)
-; RV64I-NEXT: lbu t6, 10(a0)
-; RV64I-NEXT: lbu s0, 11(a0)
-; RV64I-NEXT: lbu s1, 12(a0)
-; RV64I-NEXT: lbu s2, 13(a0)
-; RV64I-NEXT: lbu s3, 14(a0)
-; RV64I-NEXT: lbu s4, 15(a0)
-; RV64I-NEXT: lbu s5, 16(a0)
-; RV64I-NEXT: lbu s6, 17(a0)
-; RV64I-NEXT: lbu s7, 18(a0)
-; RV64I-NEXT: lbu s8, 19(a0)
-; RV64I-NEXT: lbu s9, 20(a0)
-; RV64I-NEXT: lbu s10, 21(a0)
-; RV64I-NEXT: lbu s11, 22(a0)
-; RV64I-NEXT: lbu ra, 23(a0)
-; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a4, 27(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: lbu t0, 0(t0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a4, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb ra, 79(sp)
-; RV64I-NEXT: sb s11, 78(sp)
-; RV64I-NEXT: sb s10, 77(sp)
-; RV64I-NEXT: sb s9, 76(sp)
-; RV64I-NEXT: sb s8, 75(sp)
-; RV64I-NEXT: sb s7, 74(sp)
-; RV64I-NEXT: sb s6, 73(sp)
-; RV64I-NEXT: sb s5, 72(sp)
-; RV64I-NEXT: sb s4, 71(sp)
-; RV64I-NEXT: sb s3, 70(sp)
-; RV64I-NEXT: sb s2, 69(sp)
-; RV64I-NEXT: sb s1, 68(sp)
-; RV64I-NEXT: sb s0, 67(sp)
-; RV64I-NEXT: sb t6, 66(sp)
-; RV64I-NEXT: sb t5, 65(sp)
-; RV64I-NEXT: sb t4, 64(sp)
-; RV64I-NEXT: sb t3, 63(sp)
-; RV64I-NEXT: sb t2, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a5, a4, a1
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: srl t0, t0, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a5, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: andi a0, t0, 31
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a6, a1, a0
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
-; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv t0, a1
-; RV32I-NEXT: lbu t1, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a0)
-; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 1(a0)
-; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 2(a0)
-; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 3(a0)
-; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 4(a0)
-; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t2, 6(a0)
-; RV32I-NEXT: lbu t3, 7(a0)
-; RV32I-NEXT: lbu t4, 8(a0)
-; RV32I-NEXT: lbu t5, 9(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 12(a0)
-; RV32I-NEXT: lbu s2, 13(a0)
-; RV32I-NEXT: lbu s3, 14(a0)
-; RV32I-NEXT: lbu s4, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s6, 17(a0)
-; RV32I-NEXT: lbu s7, 18(a0)
-; RV32I-NEXT: lbu s8, 19(a0)
-; RV32I-NEXT: lbu s9, 20(a0)
-; RV32I-NEXT: lbu s10, 21(a0)
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: lbu a7, 24(a0)
-; RV32I-NEXT: lbu a6, 25(a0)
-; RV32I-NEXT: lbu a5, 26(a0)
-; RV32I-NEXT: lbu a4, 27(a0)
-; RV32I-NEXT: lbu a1, 30(a0)
-; RV32I-NEXT: lbu a3, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: lbu t0, 0(t0)
-; RV32I-NEXT: sb a1, 58(sp)
-; RV32I-NEXT: sb a3, 57(sp)
-; RV32I-NEXT: sb a0, 56(sp)
-; RV32I-NEXT: sb a4, 55(sp)
-; RV32I-NEXT: sb a5, 54(sp)
-; RV32I-NEXT: sb a6, 53(sp)
-; RV32I-NEXT: sb t1, 59(sp)
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: sb a7, 52(sp)
-; RV32I-NEXT: sb ra, 51(sp)
-; RV32I-NEXT: sb s11, 50(sp)
-; RV32I-NEXT: sb s10, 49(sp)
-; RV32I-NEXT: sb s9, 48(sp)
-; RV32I-NEXT: sb s8, 47(sp)
-; RV32I-NEXT: sb s7, 46(sp)
-; RV32I-NEXT: sb s6, 45(sp)
-; RV32I-NEXT: sb s5, 44(sp)
-; RV32I-NEXT: sb s4, 43(sp)
-; RV32I-NEXT: sb s3, 42(sp)
-; RV32I-NEXT: sb s2, 41(sp)
-; RV32I-NEXT: sb s1, 40(sp)
-; RV32I-NEXT: sb s0, 39(sp)
-; RV32I-NEXT: sb t6, 38(sp)
-; RV32I-NEXT: sb t5, 37(sp)
-; RV32I-NEXT: sb t4, 36(sp)
-; RV32I-NEXT: sb t3, 35(sp)
-; RV32I-NEXT: sb t2, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: srai a0, t1, 31
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a0, 84(sp)
-; RV32I-NEXT: sb a0, 80(sp)
-; RV32I-NEXT: sb a0, 76(sp)
-; RV32I-NEXT: sb a0, 72(sp)
-; RV32I-NEXT: sb a0, 68(sp)
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: srli a1, a0, 24
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 90(sp)
-; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a1, 87(sp)
-; RV32I-NEXT: sb a3, 86(sp)
-; RV32I-NEXT: sb a0, 85(sp)
-; RV32I-NEXT: sb a1, 83(sp)
-; RV32I-NEXT: sb a3, 82(sp)
-; RV32I-NEXT: sb a0, 81(sp)
-; RV32I-NEXT: sb a1, 79(sp)
-; RV32I-NEXT: sb a3, 78(sp)
-; RV32I-NEXT: sb a0, 77(sp)
-; RV32I-NEXT: sb a1, 75(sp)
-; RV32I-NEXT: sb a3, 74(sp)
-; RV32I-NEXT: sb a0, 73(sp)
-; RV32I-NEXT: sb a1, 71(sp)
-; RV32I-NEXT: sb a3, 70(sp)
-; RV32I-NEXT: sb a0, 69(sp)
-; RV32I-NEXT: sb a1, 67(sp)
-; RV32I-NEXT: sb a3, 66(sp)
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: sb a1, 63(sp)
-; RV32I-NEXT: sb a3, 62(sp)
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: andi a0, t0, 31
-; RV32I-NEXT: addi a1, sp, 28
-; RV32I-NEXT: add a6, a1, a0
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t3, t3, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 64(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw t2, 32(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a3, 4(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: addi a3, sp, 4
+; RV32I-NEXT: add a5, a3, a0
+; RV32I-NEXT: lw a3, 4(a5)
+; RV32I-NEXT: slli a6, a1, 3
+; RV32I-NEXT: srl a4, a3, a6
+; RV32I-NEXT: lw a7, 8(a5)
+; RV32I-NEXT: andi a0, a6, 24
+; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: lw a1, 0(a5)
+; RV32I-NEXT: slli a0, a7, 1
+; RV32I-NEXT: sll a0, a0, t0
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: srl t1, a1, a6
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw t2, 12(a5)
+; RV32I-NEXT: lw t3, 16(a5)
+; RV32I-NEXT: sll a1, a3, t0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: srl t4, t2, a6
+; RV32I-NEXT: slli a3, t3, 1
+; RV32I-NEXT: sll a3, a3, t0
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw t5, 20(a5)
+; RV32I-NEXT: lw t6, 24(a5)
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: or t2, a7, t2
+; RV32I-NEXT: srl s0, t5, a6
+; RV32I-NEXT: slli s1, t6, 1
+; RV32I-NEXT: sll s1, s1, t0
+; RV32I-NEXT: or s1, s0, s1
+; RV32I-NEXT: srl t3, t3, a6
+; RV32I-NEXT: slli t5, t5, 1
+; RV32I-NEXT: lw a5, 28(a5)
+; RV32I-NEXT: sll t5, t5, t0
+; RV32I-NEXT: or t5, t3, t5
+; RV32I-NEXT: srl t6, t6, a6
+; RV32I-NEXT: slli s2, a5, 1
+; RV32I-NEXT: sll t0, s2, t0
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: sra a5, a5, a6
+; RV32I-NEXT: sb t6, 24(a2)
; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
+; RV32I-NEXT: sb t3, 16(a2)
+; RV32I-NEXT: sb s0, 20(a2)
+; RV32I-NEXT: sb a7, 8(a2)
; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb t1, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 31(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 27(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t5, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t5, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t5, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, s1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, s1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli s1, s1, 8
+; RV32I-NEXT: sb s1, 21(a2)
+; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afa..7e879b137b4f0 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -704,164 +704,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 1(a1)
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: or a1, a1, s0
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb s4, 26(sp)
-; RV32I-NEXT: sb s3, 25(sp)
-; RV32I-NEXT: sb s2, 24(sp)
-; RV32I-NEXT: sb t6, 23(sp)
-; RV32I-NEXT: sb t5, 22(sp)
-; RV32I-NEXT: sb t4, 21(sp)
-; RV32I-NEXT: sb t3, 20(sp)
-; RV32I-NEXT: sb t2, 19(sp)
-; RV32I-NEXT: sb t1, 18(sp)
-; RV32I-NEXT: sb t0, 17(sp)
-; RV32I-NEXT: sb a7, 16(sp)
-; RV32I-NEXT: sb a6, 15(sp)
-; RV32I-NEXT: sb a5, 14(sp)
-; RV32I-NEXT: sb a4, 13(sp)
-; RV32I-NEXT: sb a3, 12(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 12
-; RV32I-NEXT: add a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: srl a0, a5, a4
-; RV32I-NEXT: lbu a1, 9(a3)
-; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: slli a1, a6, 1
-; RV32I-NEXT: not a7, a4
-; RV32I-NEXT: sll a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu a7, 1(a3)
-; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: or a7, a7, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: srl a7, a7, a4
-; RV32I-NEXT: slli a5, a5, 1
-; RV32I-NEXT: xori t0, a4, 31
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: srl a6, a6, a4
-; RV32I-NEXT: lbu t1, 13(a3)
-; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: slli t1, a3, 1
-; RV32I-NEXT: sll t0, t1, t0
-; RV32I-NEXT: or t0, a6, t0
-; RV32I-NEXT: srl a3, a3, a4
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: srli a4, a6, 16
-; RV32I-NEXT: sb a4, 10(a2)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 9(a2)
-; RV32I-NEXT: srli a4, a3, 16
-; RV32I-NEXT: sb a4, 14(a2)
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: sb a4, 15(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a7, 16
-; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a7, 8
-; RV32I-NEXT: sb a3, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a4, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli a7, a0, 1
+; RV32I-NEXT: sll a6, a7, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a5, 24
; RV32I-NEXT: sb a0, 11(a2)
-; RV32I-NEXT: srli a5, a5, 24
-; RV32I-NEXT: sb a5, 3(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, a4, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a4, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -987,164 +940,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 1(a1)
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: or a1, a1, s0
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb a0, 43(sp)
-; RV32I-NEXT: sb s4, 42(sp)
-; RV32I-NEXT: sb s3, 41(sp)
-; RV32I-NEXT: sb s2, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: sb t0, 33(sp)
-; RV32I-NEXT: sb a7, 32(sp)
-; RV32I-NEXT: sb a6, 31(sp)
-; RV32I-NEXT: sb a5, 30(sp)
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a3, 28(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 28
-; RV32I-NEXT: sub a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: sll a0, a5, a4
-; RV32I-NEXT: lbu a1, 1(a3)
-; RV32I-NEXT: lbu a6, 0(a3)
-; RV32I-NEXT: lbu a7, 2(a3)
-; RV32I-NEXT: lbu t0, 3(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: srli a1, a6, 1
-; RV32I-NEXT: xori a7, a4, 31
-; RV32I-NEXT: srl a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu t0, 13(a3)
-; RV32I-NEXT: lbu t1, 12(a3)
-; RV32I-NEXT: lbu t2, 14(a3)
-; RV32I-NEXT: lbu t3, 15(a3)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t3, t2
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: sll t0, t0, a4
-; RV32I-NEXT: lbu t1, 9(a3)
-; RV32I-NEXT: lbu t2, 8(a3)
-; RV32I-NEXT: lbu t3, 10(a3)
-; RV32I-NEXT: lbu a3, 11(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: srli t1, a3, 1
-; RV32I-NEXT: srl a7, t1, a7
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: sll a3, a3, a4
-; RV32I-NEXT: srli a5, a5, 1
-; RV32I-NEXT: not t1, a4
-; RV32I-NEXT: srl a5, a5, t1
-; RV32I-NEXT: or a5, a3, a5
-; RV32I-NEXT: sll a4, a6, a4
-; RV32I-NEXT: sb a4, 0(a2)
-; RV32I-NEXT: srli a6, a3, 16
-; RV32I-NEXT: sb a6, 10(a2)
-; RV32I-NEXT: srli a6, a3, 24
-; RV32I-NEXT: sb a6, 11(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 9(a2)
-; RV32I-NEXT: srli a3, t0, 16
-; RV32I-NEXT: sb a3, 14(a2)
-; RV32I-NEXT: srli a3, t0, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a3, t0, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: addi a3, sp, 16
+; RV32I-NEXT: sub a3, a3, a0
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a0, a1
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: srli a7, a4, 1
+; RV32I-NEXT: lw t0, 12(a3)
+; RV32I-NEXT: lw a3, 8(a3)
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: sll a7, t0, a1
+; RV32I-NEXT: srli t0, a3, 1
+; RV32I-NEXT: srl t0, t0, a6
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: sll a3, a3, a1
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a3, a1, 16
; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: srli a3, a1, 24
; RV32I-NEXT: sb a3, 3(a2)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
-; RV32I-NEXT: srli a3, a0, 24
-; RV32I-NEXT: sb a3, 7(a2)
-; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: sb a0, 8(a2)
; RV32I-NEXT: sb a7, 12(a2)
-; RV32I-NEXT: sb a1, 4(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: sb a5, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 11(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a7, 16
+; RV32I-NEXT: sb a0, 14(a2)
+; RV32I-NEXT: srli a0, a7, 24
+; RV32I-NEXT: sb a0, 15(a2)
+; RV32I-NEXT: srli a0, a7, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a5, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1270,171 +1176,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t2, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 1(a1)
-; RV32I-NEXT: lbu s2, 0(a1)
-; RV32I-NEXT: lbu s3, 11(a0)
-; RV32I-NEXT: lbu s4, 12(a0)
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: or s1, s1, s2
-; RV32I-NEXT: lbu s2, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s5, 13(a0)
-; RV32I-NEXT: lbu a0, 14(a0)
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s2
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: sb a3, 23(sp)
-; RV32I-NEXT: sb a0, 22(sp)
-; RV32I-NEXT: sb s5, 21(sp)
-; RV32I-NEXT: sb s4, 20(sp)
-; RV32I-NEXT: sb s3, 19(sp)
-; RV32I-NEXT: sb s0, 18(sp)
-; RV32I-NEXT: sb t6, 17(sp)
-; RV32I-NEXT: sb t5, 16(sp)
-; RV32I-NEXT: sb t4, 15(sp)
-; RV32I-NEXT: sb t3, 14(sp)
-; RV32I-NEXT: sb t2, 13(sp)
-; RV32I-NEXT: sb t1, 12(sp)
-; RV32I-NEXT: sb t0, 11(sp)
-; RV32I-NEXT: sb a7, 10(sp)
-; RV32I-NEXT: sb a6, 9(sp)
-; RV32I-NEXT: sb a5, 8(sp)
-; RV32I-NEXT: srai a4, a4, 31
-; RV32I-NEXT: sb a4, 36(sp)
-; RV32I-NEXT: sb a4, 32(sp)
-; RV32I-NEXT: sb a4, 28(sp)
-; RV32I-NEXT: sb a4, 24(sp)
-; RV32I-NEXT: srli a0, a4, 24
-; RV32I-NEXT: sb a0, 39(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 38(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 37(sp)
-; RV32I-NEXT: sb a0, 35(sp)
-; RV32I-NEXT: sb a3, 34(sp)
-; RV32I-NEXT: sb a4, 33(sp)
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 8
-; RV32I-NEXT: add a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: srl a0, a5, a4
-; RV32I-NEXT: lbu a1, 9(a3)
-; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: slli a1, a6, 1
-; RV32I-NEXT: not a7, a4
-; RV32I-NEXT: sll a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu a7, 1(a3)
-; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: srl a7, a7, a4
-; RV32I-NEXT: slli a5, a5, 1
-; RV32I-NEXT: xori t0, a4, 31
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: srl a6, a6, a4
-; RV32I-NEXT: lbu t1, 13(a3)
-; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: slli t1, a3, 1
-; RV32I-NEXT: sll t0, t1, t0
-; RV32I-NEXT: or t0, a6, t0
-; RV32I-NEXT: sra a3, a3, a4
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: srli a4, a6, 16
-; RV32I-NEXT: sb a4, 10(a2)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 9(a2)
-; RV32I-NEXT: srli a4, a3, 16
-; RV32I-NEXT: sb a4, 14(a2)
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: sb a4, 15(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a7, 16
-; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a7, 8
-; RV32I-NEXT: sb a3, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a4, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli a7, a0, 1
+; RV32I-NEXT: sll a6, a7, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a5, 24
; RV32I-NEXT: sb a0, 11(a2)
-; RV32I-NEXT: srli a5, a5, 24
-; RV32I-NEXT: sb a5, 3(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, a4, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a4, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1446,191 +1299,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
-; RV64I-NEXT: lbu s9, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 87(sp)
-; RV64I-NEXT: sb a3, 86(sp)
-; RV64I-NEXT: sb a4, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a5, 83(sp)
-; RV64I-NEXT: sb a6, 82(sp)
-; RV64I-NEXT: sb a7, 81(sp)
-; RV64I-NEXT: sb s11, 80(sp)
-; RV64I-NEXT: sb s10, 79(sp)
-; RV64I-NEXT: sb ra, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a3, sp, 56
-; RV64I-NEXT: add a3, a3, a0
-; RV64I-NEXT: lbu a0, 9(a3)
-; RV64I-NEXT: lbu a1, 8(a3)
-; RV64I-NEXT: lbu a4, 10(a3)
-; RV64I-NEXT: lbu a5, 11(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a1, 13(a3)
-; RV64I-NEXT: lbu a4, 12(a3)
-; RV64I-NEXT: lbu a5, 14(a3)
-; RV64I-NEXT: lbu a6, 15(a3)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a4, a1, a0
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a0, 17(a3)
-; RV64I-NEXT: lbu a5, 16(a3)
-; RV64I-NEXT: lbu a6, 18(a3)
-; RV64I-NEXT: lbu a7, 19(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a3)
-; RV64I-NEXT: lbu a6, 20(a3)
-; RV64I-NEXT: lbu a7, 22(a3)
-; RV64I-NEXT: lbu t0, 23(a3)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -1638,92 +1343,138 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a1
-; RV64I-NEXT: sll a0, a0, a6
-; RV64I-NEXT: lbu a6, 1(a3)
-; RV64I-NEXT: lbu a7, 0(a3)
-; RV64I-NEXT: lbu t0, 2(a3)
-; RV64I-NEXT: lbu t1, 3(a3)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a3)
-; RV64I-NEXT: lbu t0, 4(a3)
-; RV64I-NEXT: lbu t1, 6(a3)
-; RV64I-NEXT: lbu t2, 7(a3)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a3)
-; RV64I-NEXT: lbu t0, 24(a3)
-; RV64I-NEXT: lbu t1, 26(a3)
-; RV64I-NEXT: lbu t2, 27(a3)
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a3)
-; RV64I-NEXT: lbu t1, 28(a3)
-; RV64I-NEXT: lbu t2, 30(a3)
-; RV64I-NEXT: lbu a3, 31(a3)
-; RV64I-NEXT: slli t0, t0, 8
-; RV64I-NEXT: or t0, t0, t1
-; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli a3, a3, 24
-; RV64I-NEXT: or a3, a3, t2
-; RV64I-NEXT: slli t1, a4, 1
-; RV64I-NEXT: or a3, a3, t0
-; RV64I-NEXT: xori t0, a1, 63
-; RV64I-NEXT: sll t1, t1, t0
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a7, a3, a7
-; RV64I-NEXT: slli a3, a7, 1
-; RV64I-NEXT: sll t0, a3, t0
-; RV64I-NEXT: srl a3, a4, a1
-; RV64I-NEXT: srl a4, a6, a1
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: srl a0, a4, a1
+; RV64I-NEXT: ld a5, 16(a3)
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: ld a7, 0(a3)
+; RV64I-NEXT: slli t0, a5, 1
+; RV64I-NEXT: sll t0, t0, a6
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: srl a7, a7, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a6
+; RV64I-NEXT: or a4, a7, a4
; RV64I-NEXT: srl a5, a5, a1
-; RV64I-NEXT: srl a1, a7, a1
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
-; RV64I-NEXT: sb a5, 16(a2)
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: slli a7, a3, 1
+; RV64I-NEXT: sll a6, a7, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: srl a1, a3, a1
; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
; RV64I-NEXT: srli a1, a1, 8
; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb a5, 16(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a5, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a5, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a5, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a5, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a5, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a5, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: sb a5, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
; RV64I-NEXT: srli a1, a4, 48
; RV64I-NEXT: sb a1, 6(a2)
; RV64I-NEXT: srli a1, a4, 40
@@ -1734,366 +1485,234 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 3(a2)
; RV64I-NEXT: srli a1, a4, 16
; RV64I-NEXT: sb a1, 2(a2)
-; RV64I-NEXT: or a1, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
; RV64I-NEXT: srli a4, a4, 8
; RV64I-NEXT: sb a4, 1(a2)
-; RV64I-NEXT: srli a4, a3, 48
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: srli a4, a3, 40
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: srli a4, a3, 32
-; RV64I-NEXT: sb a4, 12(a2)
-; RV64I-NEXT: srli a4, a3, 24
-; RV64I-NEXT: sb a4, 11(a2)
-; RV64I-NEXT: srli a4, a3, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
-; RV64I-NEXT: sb a3, 8(a2)
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: srli a3, a6, 56
-; RV64I-NEXT: sb a3, 23(a2)
-; RV64I-NEXT: srli a1, a1, 56
-; RV64I-NEXT: sb a1, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s10, 1(a1)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or s10, s10, s11
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: or t0, a1, s10
-; RV32I-NEXT: lbu s10, 24(a0)
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu a6, 26(a0)
-; RV32I-NEXT: lbu a5, 27(a0)
-; RV32I-NEXT: lbu a1, 31(a0)
-; RV32I-NEXT: lbu a3, 30(a0)
-; RV32I-NEXT: lbu a4, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: sb a1, 59(sp)
-; RV32I-NEXT: sb a3, 58(sp)
-; RV32I-NEXT: sb a4, 57(sp)
-; RV32I-NEXT: sb a0, 56(sp)
-; RV32I-NEXT: sb a5, 55(sp)
-; RV32I-NEXT: sb a6, 54(sp)
-; RV32I-NEXT: sb a7, 53(sp)
-; RV32I-NEXT: sb s10, 52(sp)
-; RV32I-NEXT: sb ra, 51(sp)
-; RV32I-NEXT: sb s11, 50(sp)
-; RV32I-NEXT: sb s9, 49(sp)
-; RV32I-NEXT: sb s8, 48(sp)
-; RV32I-NEXT: sb s7, 47(sp)
-; RV32I-NEXT: sb s6, 46(sp)
-; RV32I-NEXT: sb s5, 45(sp)
-; RV32I-NEXT: sb s4, 44(sp)
-; RV32I-NEXT: sb zero, 91(sp)
-; RV32I-NEXT: sb zero, 90(sp)
-; RV32I-NEXT: sb zero, 89(sp)
-; RV32I-NEXT: sb zero, 88(sp)
-; RV32I-NEXT: sb zero, 87(sp)
-; RV32I-NEXT: sb zero, 86(sp)
-; RV32I-NEXT: sb zero, 85(sp)
-; RV32I-NEXT: sb zero, 84(sp)
-; RV32I-NEXT: sb zero, 83(sp)
-; RV32I-NEXT: sb zero, 82(sp)
-; RV32I-NEXT: sb zero, 81(sp)
-; RV32I-NEXT: sb zero, 80(sp)
-; RV32I-NEXT: sb zero, 79(sp)
-; RV32I-NEXT: sb zero, 78(sp)
-; RV32I-NEXT: sb zero, 77(sp)
-; RV32I-NEXT: sb zero, 76(sp)
-; RV32I-NEXT: sb zero, 75(sp)
-; RV32I-NEXT: sb zero, 74(sp)
-; RV32I-NEXT: sb zero, 73(sp)
-; RV32I-NEXT: sb zero, 72(sp)
-; RV32I-NEXT: sb zero, 71(sp)
-; RV32I-NEXT: sb zero, 70(sp)
-; RV32I-NEXT: sb zero, 69(sp)
-; RV32I-NEXT: sb zero, 68(sp)
-; RV32I-NEXT: sb zero, 67(sp)
-; RV32I-NEXT: sb zero, 66(sp)
-; RV32I-NEXT: sb zero, 65(sp)
-; RV32I-NEXT: sb zero, 64(sp)
-; RV32I-NEXT: sb zero, 63(sp)
-; RV32I-NEXT: sb zero, 62(sp)
-; RV32I-NEXT: sb zero, 61(sp)
-; RV32I-NEXT: sb zero, 60(sp)
-; RV32I-NEXT: sb s3, 43(sp)
-; RV32I-NEXT: sb s2, 42(sp)
-; RV32I-NEXT: sb s1, 41(sp)
-; RV32I-NEXT: sb s0, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: slli a0, t0, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 28
-; RV32I-NEXT: add a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a3, t0, 7
-; RV32I-NEXT: lbu a0, 9(a4)
-; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
-; RV32I-NEXT: or a6, a1, a0
-; RV32I-NEXT: slli a0, a6, 1
-; RV32I-NEXT: not t1, a3
-; RV32I-NEXT: sll a0, a0, t1
-; RV32I-NEXT: lbu a1, 1(a4)
-; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or t0, a5, a1
-; RV32I-NEXT: slli a1, t5, 1
-; RV32I-NEXT: xori t2, a3, 31
-; RV32I-NEXT: sll a1, a1, t2
-; RV32I-NEXT: lbu a5, 13(a4)
-; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t3, a7, a5
-; RV32I-NEXT: lbu a5, 17(a4)
-; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
-; RV32I-NEXT: or t4, a7, a5
-; RV32I-NEXT: slli a5, t4, 1
-; RV32I-NEXT: sll a7, a5, t1
-; RV32I-NEXT: lbu a5, 21(a4)
-; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a5
-; RV32I-NEXT: lbu a5, 25(a4)
-; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: or t6, t6, a5
-; RV32I-NEXT: lbu a5, 29(a4)
-; RV32I-NEXT: lbu s1, 28(a4)
-; RV32I-NEXT: slli s2, t6, 1
-; RV32I-NEXT: sll t1, s2, t1
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: slli s1, s0, 1
-; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: or s3, a4, a5
-; RV32I-NEXT: slli a4, s3, 1
-; RV32I-NEXT: sll t2, a4, t2
-; RV32I-NEXT: srl a4, t5, a3
-; RV32I-NEXT: srl a5, t0, a3
-; RV32I-NEXT: srl t0, t3, a3
-; RV32I-NEXT: srl a6, a6, a3
-; RV32I-NEXT: srl t3, s0, a3
-; RV32I-NEXT: srl t4, t4, a3
-; RV32I-NEXT: srl t5, t6, a3
-; RV32I-NEXT: srl a3, s3, a3
-; RV32I-NEXT: srli t6, t5, 16
-; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: or t2, t5, t2
-; RV32I-NEXT: sb t5, 24(a2)
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: sb t5, 25(a2)
-; RV32I-NEXT: srli t5, a3, 24
-; RV32I-NEXT: sb t5, 31(a2)
-; RV32I-NEXT: srli t5, a3, 16
-; RV32I-NEXT: sb t5, 30(a2)
-; RV32I-NEXT: sb a3, 28(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 29(a2)
-; RV32I-NEXT: srli a3, t4, 16
-; RV32I-NEXT: sb a3, 18(a2)
-; RV32I-NEXT: or a3, t4, s1
-; RV32I-NEXT: sb t4, 16(a2)
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb t4, 17(a2)
-; RV32I-NEXT: srli t4, t3, 16
-; RV32I-NEXT: sb t4, 22(a2)
-; RV32I-NEXT: or t1, t3, t1
-; RV32I-NEXT: sb t3, 20(a2)
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, a6, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, a6, s2
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: srli a6, a6, 8
-; RV32I-NEXT: sb a6, 9(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: sb a6, 14(a2)
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: srli a7, t0, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a5, 16
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: sb a5, 0(a2)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 1(a2)
-; RV32I-NEXT: srli a5, a4, 16
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: mv a1, sp
+; RV32I-NEXT: add a4, a1, a0
+; RV32I-NEXT: lw a1, 4(a4)
+; RV32I-NEXT: srl a0, a1, a7
+; RV32I-NEXT: lw a5, 8(a4)
+; RV32I-NEXT: andi a3, a7, 31
+; RV32I-NEXT: xori a6, a3, 31
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 16(a4)
+; RV32I-NEXT: sll a1, a1, a6
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srl a3, t0, a7
+; RV32I-NEXT: slli t2, t1, 1
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or a3, a3, t2
+; RV32I-NEXT: srl a5, a5, a7
+; RV32I-NEXT: slli t0, t0, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 24(a4)
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a5, a5, t0
+; RV32I-NEXT: srl t0, t2, a7
+; RV32I-NEXT: slli t4, t3, 1
+; RV32I-NEXT: sll t4, t4, a6
+; RV32I-NEXT: or t0, t0, t4
+; RV32I-NEXT: srl t1, t1, a7
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: srl t2, t3, a7
+; RV32I-NEXT: slli t3, a4, 1
+; RV32I-NEXT: sll a6, t3, a6
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: srl a4, a4, a7
+; RV32I-NEXT: sb a4, 28(a2)
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: sb a7, 31(a2)
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: sb a7, 30(a2)
; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(a2)
-; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb t1, 16(a2)
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, a6, 24
; RV32I-NEXT: sb a4, 27(a2)
-; RV32I-NEXT: srli a3, a3, 24
-; RV32I-NEXT: sb a3, 19(a2)
-; RV32I-NEXT: srli a3, t1, 24
-; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: srli a3, t3, 24
-; RV32I-NEXT: sb a3, 11(a2)
-; RV32I-NEXT: srli a3, a6, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 3(a2)
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a4, a6, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, a6, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2104,191 +1723,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
-; RV64I-NEXT: lbu s9, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: sb a0, 116(sp)
-; RV64I-NEXT: sb a5, 115(sp)
-; RV64I-NEXT: sb a6, 114(sp)
-; RV64I-NEXT: sb a7, 113(sp)
-; RV64I-NEXT: sb s11, 112(sp)
-; RV64I-NEXT: sb s10, 111(sp)
-; RV64I-NEXT: sb ra, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 88
-; RV64I-NEXT: sub a0, a1, a0
-; RV64I-NEXT: lbu a1, 9(a0)
-; RV64I-NEXT: lbu a3, 8(a0)
-; RV64I-NEXT: lbu a4, 10(a0)
-; RV64I-NEXT: lbu a5, 11(a0)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: lbu a3, 13(a0)
-; RV64I-NEXT: lbu a4, 12(a0)
-; RV64I-NEXT: lbu a5, 14(a0)
-; RV64I-NEXT: lbu a6, 15(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a3, a3, a1
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a4, 1(a0)
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a6, 2(a0)
-; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 5(a0)
-; RV64I-NEXT: lbu a6, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
-; RV64I-NEXT: lbu t0, 7(a0)
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2297,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 25(a0)
-; RV64I-NEXT: lbu a6, 24(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu t0, 27(a0)
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 29(a0)
-; RV64I-NEXT: lbu a7, 28(a0)
-; RV64I-NEXT: lbu t0, 30(a0)
-; RV64I-NEXT: lbu t1, 31(a0)
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -2319,439 +1790,353 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: slli a6, a6, 32
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu a7, 16(a0)
-; RV64I-NEXT: lbu t0, 18(a0)
-; RV64I-NEXT: lbu t1, 19(a0)
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: lbu a7, 21(a0)
-; RV64I-NEXT: or t0, t1, t0
-; RV64I-NEXT: or a6, t0, a6
-; RV64I-NEXT: lbu t0, 20(a0)
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t1, 22(a0)
-; RV64I-NEXT: lbu a0, 23(a0)
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: srli t0, a4, 1
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or t1, a0, t1
-; RV64I-NEXT: xori t2, a1, 63
-; RV64I-NEXT: srl a0, t0, t2
-; RV64I-NEXT: or a7, t1, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: srli a7, a6, 1
-; RV64I-NEXT: srl a7, a7, t2
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: ld a5, 0(a3)
+; RV64I-NEXT: sll a0, a4, a1
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: srli a7, a5, 1
+; RV64I-NEXT: ld t0, 24(a3)
+; RV64I-NEXT: ld a3, 16(a3)
+; RV64I-NEXT: srl a7, a7, a6
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: sll a7, t0, a1
; RV64I-NEXT: srli t0, a3, 1
-; RV64I-NEXT: not t1, a1
-; RV64I-NEXT: srl t0, t0, t1
+; RV64I-NEXT: srl t0, t0, a6
+; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: sll a3, a3, a1
-; RV64I-NEXT: sll a5, a5, a1
-; RV64I-NEXT: sll a6, a6, a1
-; RV64I-NEXT: sll a1, a4, a1
-; RV64I-NEXT: srli a4, a6, 56
-; RV64I-NEXT: sb a4, 23(a2)
-; RV64I-NEXT: srli a4, a6, 48
-; RV64I-NEXT: sb a4, 22(a2)
-; RV64I-NEXT: srli a4, a6, 40
-; RV64I-NEXT: sb a4, 21(a2)
-; RV64I-NEXT: srli a4, a6, 32
-; RV64I-NEXT: sb a4, 20(a2)
-; RV64I-NEXT: srli a4, a6, 24
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: srli a4, a6, 16
-; RV64I-NEXT: sb a4, 18(a2)
-; RV64I-NEXT: or a4, a6, t0
-; RV64I-NEXT: srli a6, a6, 8
-; RV64I-NEXT: sb a6, 17(a2)
-; RV64I-NEXT: srli a6, a5, 56
-; RV64I-NEXT: sb a6, 31(a2)
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 30(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 29(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 28(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 27(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 26(a2)
-; RV64I-NEXT: or a6, a5, a7
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 25(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 7(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 6(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 5(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 4(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 3(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 2(a2)
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a6
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: sll a1, a5, a1
; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: srli a4, a1, 56
+; RV64I-NEXT: sb a4, 7(a2)
+; RV64I-NEXT: srli a4, a1, 48
+; RV64I-NEXT: sb a4, 6(a2)
+; RV64I-NEXT: srli a4, a1, 40
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: srli a4, a1, 32
+; RV64I-NEXT: sb a4, 4(a2)
+; RV64I-NEXT: srli a4, a1, 24
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: srli a4, a1, 16
+; RV64I-NEXT: sb a4, 2(a2)
; RV64I-NEXT: srli a1, a1, 8
; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: sb a3, 16(a2)
+; RV64I-NEXT: sb a7, 24(a2)
+; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: srli a1, a3, 56
-; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: sb a1, 23(a2)
; RV64I-NEXT: srli a1, a3, 48
-; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: sb a1, 22(a2)
; RV64I-NEXT: srli a1, a3, 40
-; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb a1, 21(a2)
; RV64I-NEXT: srli a1, a3, 32
-; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: sb a1, 20(a2)
; RV64I-NEXT: srli a1, a3, 24
-; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: sb a1, 19(a2)
; RV64I-NEXT: srli a1, a3, 16
-; RV64I-NEXT: sb a1, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: sb a1, 18(a2)
; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: sb a4, 16(a2)
-; RV64I-NEXT: sb a6, 24(a2)
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s10, 1(a1)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or s10, s10, s11
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: or t0, a1, s10
-; RV32I-NEXT: lbu s10, 24(a0)
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu a6, 26(a0)
-; RV32I-NEXT: lbu a5, 27(a0)
-; RV32I-NEXT: lbu a1, 31(a0)
-; RV32I-NEXT: lbu a3, 30(a0)
-; RV32I-NEXT: lbu a4, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: sb a3, 90(sp)
-; RV32I-NEXT: sb a4, 89(sp)
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a5, 87(sp)
-; RV32I-NEXT: sb a6, 86(sp)
-; RV32I-NEXT: sb a7, 85(sp)
-; RV32I-NEXT: sb s10, 84(sp)
-; RV32I-NEXT: sb ra, 83(sp)
-; RV32I-NEXT: sb s11, 82(sp)
-; RV32I-NEXT: sb s9, 81(sp)
-; RV32I-NEXT: sb s8, 80(sp)
-; RV32I-NEXT: sb s7, 79(sp)
-; RV32I-NEXT: sb s6, 78(sp)
-; RV32I-NEXT: sb s5, 77(sp)
-; RV32I-NEXT: sb s4, 76(sp)
-; RV32I-NEXT: sb zero, 59(sp)
-; RV32I-NEXT: sb zero, 58(sp)
-; RV32I-NEXT: sb zero, 57(sp)
-; RV32I-NEXT: sb zero, 56(sp)
-; RV32I-NEXT: sb zero, 55(sp)
-; RV32I-NEXT: sb zero, 54(sp)
-; RV32I-NEXT: sb zero, 53(sp)
-; RV32I-NEXT: sb zero, 52(sp)
-; RV32I-NEXT: sb zero, 51(sp)
-; RV32I-NEXT: sb zero, 50(sp)
-; RV32I-NEXT: sb zero, 49(sp)
-; RV32I-NEXT: sb zero, 48(sp)
-; RV32I-NEXT: sb zero, 47(sp)
-; RV32I-NEXT: sb zero, 46(sp)
-; RV32I-NEXT: sb zero, 45(sp)
-; RV32I-NEXT: sb zero, 44(sp)
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb s3, 75(sp)
-; RV32I-NEXT: sb s2, 74(sp)
-; RV32I-NEXT: sb s1, 73(sp)
-; RV32I-NEXT: sb s0, 72(sp)
-; RV32I-NEXT: sb t6, 71(sp)
-; RV32I-NEXT: sb t5, 70(sp)
-; RV32I-NEXT: sb t4, 69(sp)
-; RV32I-NEXT: sb t3, 68(sp)
-; RV32I-NEXT: sb t2, 67(sp)
-; RV32I-NEXT: sb t1, 66(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 63(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 62(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: slli a0, t0, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 60
-; RV32I-NEXT: sub a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a1, t0, 7
-; RV32I-NEXT: lbu a0, 1(a4)
-; RV32I-NEXT: lbu a3, 0(a4)
-; RV32I-NEXT: lbu a5, 2(a4)
-; RV32I-NEXT: lbu a6, 3(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a6, a5
-; RV32I-NEXT: or a6, a3, a0
-; RV32I-NEXT: srli a0, a6, 1
-; RV32I-NEXT: xori a7, a1, 31
-; RV32I-NEXT: srl a0, a0, a7
-; RV32I-NEXT: lbu a3, 13(a4)
-; RV32I-NEXT: lbu a5, 12(a4)
-; RV32I-NEXT: lbu t0, 14(a4)
-; RV32I-NEXT: lbu t1, 15(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a5
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a5, t1, t0
-; RV32I-NEXT: or t0, a5, a3
-; RV32I-NEXT: lbu a3, 9(a4)
-; RV32I-NEXT: lbu a5, 8(a4)
-; RV32I-NEXT: lbu t1, 10(a4)
-; RV32I-NEXT: lbu t2, 11(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a5, t2, t1
-; RV32I-NEXT: or t1, a5, a3
-; RV32I-NEXT: srli a3, t1, 1
-; RV32I-NEXT: srl a5, a3, a7
-; RV32I-NEXT: srli t4, t5, 1
-; RV32I-NEXT: not t2, a1
-; RV32I-NEXT: lbu a3, 21(a4)
-; RV32I-NEXT: lbu t3, 20(a4)
-; RV32I-NEXT: lbu t6, 22(a4)
-; RV32I-NEXT: lbu s0, 23(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t3, s0, t6
-; RV32I-NEXT: or t3, t3, a3
-; RV32I-NEXT: lbu a3, 17(a4)
-; RV32I-NEXT: lbu t6, 16(a4)
-; RV32I-NEXT: lbu s0, 18(a4)
-; RV32I-NEXT: lbu s1, 19(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a3
-; RV32I-NEXT: lbu a3, 29(a4)
-; RV32I-NEXT: lbu t6, 28(a4)
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu s2, 31(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: lbu s1, 25(a4)
-; RV32I-NEXT: lbu s2, 24(a4)
-; RV32I-NEXT: srl t4, t4, t2
-; RV32I-NEXT: or t6, t6, a3
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: or a3, s1, s2
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu a4, 27(a4)
-; RV32I-NEXT: srli s2, s0, 1
-; RV32I-NEXT: srl s2, s2, a7
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: srli s1, t0, 1
-; RV32I-NEXT: srl s1, s1, t2
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: srli a3, a4, 1
-; RV32I-NEXT: srl a7, a3, a7
-; RV32I-NEXT: srli a3, t3, 1
-; RV32I-NEXT: srl t2, a3, t2
-; RV32I-NEXT: sll a3, t5, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll t1, t1, a1
-; RV32I-NEXT: sll t3, t3, a1
-; RV32I-NEXT: sll t5, s0, a1
-; RV32I-NEXT: sll t6, t6, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll a1, a6, a1
-; RV32I-NEXT: srli a6, a4, 24
-; RV32I-NEXT: sb a6, 27(a2)
-; RV32I-NEXT: srli a6, a4, 16
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: or a6, a4, t2
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t2, 56(sp)
+; RV32I-NEXT: sw t1, 52(sp)
+; RV32I-NEXT: sw t0, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: addi a1, sp, 32
+; RV32I-NEXT: sub a4, a1, a0
+; RV32I-NEXT: lw a3, 4(a4)
+; RV32I-NEXT: lw a5, 0(a4)
+; RV32I-NEXT: sll a0, a3, a7
+; RV32I-NEXT: andi a1, a7, 31
+; RV32I-NEXT: xori a6, a1, 31
+; RV32I-NEXT: srli a1, a5, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 8(a4)
+; RV32I-NEXT: srl a1, a1, a6
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a7
+; RV32I-NEXT: srli t2, t1, 1
+; RV32I-NEXT: srl t2, t2, a6
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sll t1, t1, a7
+; RV32I-NEXT: srli a3, a3, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 16(a4)
+; RV32I-NEXT: srl a3, a3, a6
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: sll t1, t2, a7
+; RV32I-NEXT: srli t4, t3, 1
+; RV32I-NEXT: srl t4, t4, a6
+; RV32I-NEXT: or t1, t1, t4
+; RV32I-NEXT: sll t3, t3, a7
+; RV32I-NEXT: srli t0, t0, 1
+; RV32I-NEXT: lw t4, 28(a4)
+; RV32I-NEXT: lw a4, 24(a4)
+; RV32I-NEXT: srl t0, t0, a6
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: sll t3, t4, a7
+; RV32I-NEXT: srli t4, a4, 1
+; RV32I-NEXT: srl t4, t4, a6
+; RV32I-NEXT: or t3, t3, t4
+; RV32I-NEXT: sll a4, a4, a7
+; RV32I-NEXT: srli t2, t2, 1
+; RV32I-NEXT: srl a6, t2, a6
+; RV32I-NEXT: or a4, a4, a6
+; RV32I-NEXT: sll a5, a5, a7
+; RV32I-NEXT: sb a5, 0(a2)
+; RV32I-NEXT: srli a6, a5, 24
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: srli a6, a5, 16
+; RV32I-NEXT: sb a6, 2(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 1(a2)
+; RV32I-NEXT: sb a4, 24(a2)
+; RV32I-NEXT: sb t3, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb t1, 20(a2)
+; RV32I-NEXT: sb a3, 8(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a4, 24
+; RV32I-NEXT: sb a5, 27(a2)
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: sb a5, 26(a2)
; RV32I-NEXT: srli a4, a4, 8
; RV32I-NEXT: sb a4, 25(a2)
-; RV32I-NEXT: srli a4, t6, 24
+; RV32I-NEXT: srli a4, t3, 24
; RV32I-NEXT: sb a4, 31(a2)
-; RV32I-NEXT: srli a4, t6, 16
+; RV32I-NEXT: srli a4, t3, 16
; RV32I-NEXT: sb a4, 30(a2)
-; RV32I-NEXT: or a4, t6, a7
-; RV32I-NEXT: srli a7, t6, 8
-; RV32I-NEXT: sb a7, 29(a2)
-; RV32I-NEXT: srli a7, t5, 24
-; RV32I-NEXT: sb a7, 19(a2)
-; RV32I-NEXT: srli a7, t5, 16
-; RV32I-NEXT: sb a7, 18(a2)
-; RV32I-NEXT: or a7, t5, s1
-; RV32I-NEXT: srli t2, t5, 8
-; RV32I-NEXT: sb t2, 17(a2)
-; RV32I-NEXT: srli t2, t3, 24
-; RV32I-NEXT: sb t2, 23(a2)
-; RV32I-NEXT: srli t2, t3, 16
-; RV32I-NEXT: sb t2, 22(a2)
-; RV32I-NEXT: or t2, t3, s2
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, t1, 24
-; RV32I-NEXT: sb t3, 11(a2)
-; RV32I-NEXT: srli t3, t1, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, t1, t4
-; RV32I-NEXT: srli t1, t1, 8
-; RV32I-NEXT: sb t1, 9(a2)
-; RV32I-NEXT: srli t1, t0, 24
-; RV32I-NEXT: sb t1, 15(a2)
-; RV32I-NEXT: srli t1, t0, 16
-; RV32I-NEXT: sb t1, 14(a2)
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: srli t0, t0, 8
-; RV32I-NEXT: sb t0, 13(a2)
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: srli t0, a1, 16
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a4, t3, 8
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 14(a2)
; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 1(a2)
-; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 13(a2)
+; RV32I-NEXT: srli a1, a0, 24
; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: srli a1, a0, 16
; RV32I-NEXT: sb a1, 6(a2)
-; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 5(a2)
-; RV32I-NEXT: sb a6, 24(a2)
-; RV32I-NEXT: sb a4, 28(a2)
-; RV32I-NEXT: sb a7, 16(a2)
-; RV32I-NEXT: sb t2, 20(a2)
-; RV32I-NEXT: sb t3, 8(a2)
-; RV32I-NEXT: sb a5, 12(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2762,200 +2147,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t3, 6(a0)
-; RV64I-NEXT: lbu t4, 7(a0)
-; RV64I-NEXT: lbu t5, 8(a0)
-; RV64I-NEXT: lbu t6, 9(a0)
-; RV64I-NEXT: lbu s0, 10(a0)
-; RV64I-NEXT: lbu s1, 11(a0)
-; RV64I-NEXT: lbu s2, 12(a0)
-; RV64I-NEXT: lbu s3, 13(a0)
-; RV64I-NEXT: lbu s4, 14(a0)
-; RV64I-NEXT: lbu s5, 15(a0)
-; RV64I-NEXT: lbu s6, 16(a0)
-; RV64I-NEXT: lbu s7, 17(a0)
-; RV64I-NEXT: lbu s8, 18(a0)
-; RV64I-NEXT: lbu s9, 19(a0)
-; RV64I-NEXT: lbu a3, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: or a3, a3, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or a3, s11, a3
-; RV64I-NEXT: lbu s11, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s11
-; RV64I-NEXT: lbu s11, 20(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 21(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t2, a1, a3
-; RV64I-NEXT: lbu t0, 23(a0)
-; RV64I-NEXT: lbu a7, 24(a0)
-; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a4, 28(a0)
-; RV64I-NEXT: lbu a0, 27(a0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a4, 84(sp)
-; RV64I-NEXT: sb a0, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb t0, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb ra, 77(sp)
-; RV64I-NEXT: sb s11, 76(sp)
-; RV64I-NEXT: sb s9, 75(sp)
-; RV64I-NEXT: sb s8, 74(sp)
-; RV64I-NEXT: sb s7, 73(sp)
-; RV64I-NEXT: sb s6, 72(sp)
-; RV64I-NEXT: sb s5, 71(sp)
-; RV64I-NEXT: sb s4, 70(sp)
-; RV64I-NEXT: sb s3, 69(sp)
-; RV64I-NEXT: sb s2, 68(sp)
-; RV64I-NEXT: sb s1, 67(sp)
-; RV64I-NEXT: sb s0, 66(sp)
-; RV64I-NEXT: sb t6, 65(sp)
-; RV64I-NEXT: sb t5, 64(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb t4, 63(sp)
-; RV64I-NEXT: sb t3, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: slli a0, t2, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a1, a1, a0
-; RV64I-NEXT: lbu a0, 9(a1)
-; RV64I-NEXT: lbu a3, 8(a1)
-; RV64I-NEXT: lbu a4, 10(a1)
-; RV64I-NEXT: lbu a5, 11(a1)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a3, 13(a1)
-; RV64I-NEXT: lbu a4, 12(a1)
-; RV64I-NEXT: lbu a5, 14(a1)
-; RV64I-NEXT: lbu a6, 15(a1)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a4, a3, a0
-; RV64I-NEXT: andi a3, t2, 7
-; RV64I-NEXT: lbu a0, 17(a1)
-; RV64I-NEXT: lbu a5, 16(a1)
-; RV64I-NEXT: lbu a6, 18(a1)
-; RV64I-NEXT: lbu a7, 19(a1)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a1)
-; RV64I-NEXT: lbu a6, 20(a1)
-; RV64I-NEXT: lbu a7, 22(a1)
-; RV64I-NEXT: lbu t0, 23(a1)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2963,467 +2191,378 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a3
-; RV64I-NEXT: sll a0, a0, a6
-; RV64I-NEXT: lbu a6, 1(a1)
-; RV64I-NEXT: lbu a7, 0(a1)
-; RV64I-NEXT: lbu t0, 2(a1)
-; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a1)
-; RV64I-NEXT: lbu t0, 4(a1)
-; RV64I-NEXT: lbu t1, 6(a1)
-; RV64I-NEXT: lbu t2, 7(a1)
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a1)
-; RV64I-NEXT: lbu t0, 24(a1)
-; RV64I-NEXT: lbu t1, 26(a1)
-; RV64I-NEXT: lbu t2, 27(a1)
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
; RV64I-NEXT: or t0, t2, t1
; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a1)
-; RV64I-NEXT: lbu t1, 28(a1)
-; RV64I-NEXT: lbu t2, 30(a1)
-; RV64I-NEXT: lbu a1, 31(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: or t0, t0, t1
; RV64I-NEXT: slli t2, t2, 16
; RV64I-NEXT: slli a1, a1, 24
; RV64I-NEXT: or a1, a1, t2
-; RV64I-NEXT: slli t1, a4, 1
; RV64I-NEXT: or a1, a1, t0
-; RV64I-NEXT: xori t0, a3, 63
-; RV64I-NEXT: sll t1, t1, t0
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a7, a1, a7
-; RV64I-NEXT: slli a1, a7, 1
-; RV64I-NEXT: sll t0, a1, t0
-; RV64I-NEXT: srl a1, a4, a3
-; RV64I-NEXT: srl a4, a6, a3
-; RV64I-NEXT: srl a5, a5, a3
-; RV64I-NEXT: sra a3, a7, a3
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: srl a0, a4, a1
+; RV64I-NEXT: ld a5, 16(a3)
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: ld a7, 0(a3)
+; RV64I-NEXT: slli t0, a5, 1
+; RV64I-NEXT: sll t0, t0, a6
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: srl a7, a7, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a6
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli a7, a3, 1
+; RV64I-NEXT: sll a6, a7, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
; RV64I-NEXT: sb a5, 16(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a5, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a5, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a5, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a5, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a5, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a5, 16
+; RV64I-NEXT: sb a1, 18(a2)
; RV64I-NEXT: srli a5, a5, 8
; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a3, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a3, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a3, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a3, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a3, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a3, 16
-; RV64I-NEXT: sb a5, 26(a2)
-; RV64I-NEXT: sb a3, 24(a2)
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 25(a2)
-; RV64I-NEXT: srli a3, a4, 48
-; RV64I-NEXT: sb a3, 6(a2)
-; RV64I-NEXT: srli a3, a4, 40
-; RV64I-NEXT: sb a3, 5(a2)
-; RV64I-NEXT: srli a3, a4, 32
-; RV64I-NEXT: sb a3, 4(a2)
-; RV64I-NEXT: srli a3, a4, 24
-; RV64I-NEXT: sb a3, 3(a2)
-; RV64I-NEXT: srli a3, a4, 16
-; RV64I-NEXT: sb a3, 2(a2)
-; RV64I-NEXT: or a3, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a4, a4, 8
; RV64I-NEXT: sb a4, 1(a2)
-; RV64I-NEXT: srli a4, a1, 48
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: srli a4, a1, 40
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: srli a4, a1, 32
-; RV64I-NEXT: sb a4, 12(a2)
-; RV64I-NEXT: srli a4, a1, 24
-; RV64I-NEXT: sb a4, 11(a2)
-; RV64I-NEXT: srli a4, a1, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a1, a0
-; RV64I-NEXT: sb a1, 8(a2)
-; RV64I-NEXT: srli a1, a1, 8
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: srli a1, a6, 56
-; RV64I-NEXT: sb a1, 23(a2)
-; RV64I-NEXT: srli a3, a3, 56
-; RV64I-NEXT: sb a3, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t3, 31(a0)
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t2, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or a3, a3, s11
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: or t1, a1, a3
-; RV32I-NEXT: lbu t0, 23(a0)
-; RV32I-NEXT: lbu a7, 24(a0)
-; RV32I-NEXT: lbu a6, 25(a0)
-; RV32I-NEXT: lbu a5, 26(a0)
-; RV32I-NEXT: lbu a1, 30(a0)
-; RV32I-NEXT: lbu a3, 29(a0)
-; RV32I-NEXT: lbu a4, 28(a0)
-; RV32I-NEXT: lbu a0, 27(a0)
-; RV32I-NEXT: sb a1, 58(sp)
-; RV32I-NEXT: sb a3, 57(sp)
-; RV32I-NEXT: sb a4, 56(sp)
-; RV32I-NEXT: sb a0, 55(sp)
-; RV32I-NEXT: sb a5, 54(sp)
-; RV32I-NEXT: sb a6, 53(sp)
-; RV32I-NEXT: sb a7, 52(sp)
-; RV32I-NEXT: sb t0, 51(sp)
-; RV32I-NEXT: sb ra, 50(sp)
-; RV32I-NEXT: sb s11, 49(sp)
-; RV32I-NEXT: sb s10, 48(sp)
-; RV32I-NEXT: sb s9, 47(sp)
-; RV32I-NEXT: sb s8, 46(sp)
-; RV32I-NEXT: sb s7, 45(sp)
-; RV32I-NEXT: sb s6, 44(sp)
-; RV32I-NEXT: sb s5, 43(sp)
-; RV32I-NEXT: sb t3, 59(sp)
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: sb s4, 42(sp)
-; RV32I-NEXT: sb s3, 41(sp)
-; RV32I-NEXT: sb s2, 40(sp)
-; RV32I-NEXT: sb s1, 39(sp)
-; RV32I-NEXT: sb s0, 38(sp)
-; RV32I-NEXT: sb t6, 37(sp)
-; RV32I-NEXT: sb t5, 36(sp)
-; RV32I-NEXT: sb t4, 35(sp)
-; RV32I-NEXT: sb t2, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: srai a0, t3, 31
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a0, 84(sp)
-; RV32I-NEXT: sb a0, 80(sp)
-; RV32I-NEXT: sb a0, 76(sp)
-; RV32I-NEXT: sb a0, 72(sp)
-; RV32I-NEXT: sb a0, 68(sp)
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: srli a1, a0, 24
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 90(sp)
-; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a1, 87(sp)
-; RV32I-NEXT: sb a3, 86(sp)
-; RV32I-NEXT: sb a0, 85(sp)
-; RV32I-NEXT: sb a1, 83(sp)
-; RV32I-NEXT: sb a3, 82(sp)
-; RV32I-NEXT: sb a0, 81(sp)
-; RV32I-NEXT: sb a1, 79(sp)
-; RV32I-NEXT: sb a3, 78(sp)
-; RV32I-NEXT: sb a0, 77(sp)
-; RV32I-NEXT: sb a1, 75(sp)
-; RV32I-NEXT: sb a3, 74(sp)
-; RV32I-NEXT: sb a0, 73(sp)
-; RV32I-NEXT: sb a1, 71(sp)
-; RV32I-NEXT: sb a3, 70(sp)
-; RV32I-NEXT: sb a0, 69(sp)
-; RV32I-NEXT: sb a1, 67(sp)
-; RV32I-NEXT: sb a3, 66(sp)
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: sb a1, 63(sp)
-; RV32I-NEXT: sb a3, 62(sp)
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: slli a0, t1, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 28
-; RV32I-NEXT: add a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a3, t1, 7
-; RV32I-NEXT: lbu a0, 9(a4)
-; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
-; RV32I-NEXT: or a6, a1, a0
-; RV32I-NEXT: slli a0, a6, 1
-; RV32I-NEXT: not t1, a3
-; RV32I-NEXT: sll a0, a0, t1
-; RV32I-NEXT: lbu a1, 1(a4)
-; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or t0, a5, a1
-; RV32I-NEXT: slli a1, t5, 1
-; RV32I-NEXT: xori t2, a3, 31
-; RV32I-NEXT: sll a1, a1, t2
-; RV32I-NEXT: lbu a5, 13(a4)
-; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t3, a7, a5
-; RV32I-NEXT: lbu a5, 17(a4)
-; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
-; RV32I-NEXT: or t4, a7, a5
-; RV32I-NEXT: slli a5, t4, 1
-; RV32I-NEXT: sll a7, a5, t1
-; RV32I-NEXT: lbu a5, 21(a4)
-; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a5
-; RV32I-NEXT: lbu a5, 25(a4)
-; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: or t6, t6, a5
-; RV32I-NEXT: lbu a5, 29(a4)
-; RV32I-NEXT: lbu s1, 28(a4)
-; RV32I-NEXT: slli s2, t6, 1
-; RV32I-NEXT: sll t1, s2, t1
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: slli s1, s0, 1
-; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: or s3, a4, a5
-; RV32I-NEXT: slli a4, s3, 1
-; RV32I-NEXT: sll t2, a4, t2
-; RV32I-NEXT: srl a4, t5, a3
-; RV32I-NEXT: srl a5, t0, a3
-; RV32I-NEXT: srl t0, t3, a3
-; RV32I-NEXT: srl a6, a6, a3
-; RV32I-NEXT: srl t3, s0, a3
-; RV32I-NEXT: srl t4, t4, a3
-; RV32I-NEXT: srl t5, t6, a3
-; RV32I-NEXT: sra a3, s3, a3
-; RV32I-NEXT: srli t6, t5, 16
-; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: or t2, t5, t2
-; RV32I-NEXT: sb t5, 24(a2)
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: sb t5, 25(a2)
-; RV32I-NEXT: srli t5, a3, 24
-; RV32I-NEXT: sb t5, 31(a2)
-; RV32I-NEXT: srli t5, a3, 16
-; RV32I-NEXT: sb t5, 30(a2)
-; RV32I-NEXT: sb a3, 28(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 29(a2)
-; RV32I-NEXT: srli a3, t4, 16
-; RV32I-NEXT: sb a3, 18(a2)
-; RV32I-NEXT: or a3, t4, s1
-; RV32I-NEXT: sb t4, 16(a2)
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb t4, 17(a2)
-; RV32I-NEXT: srli t4, t3, 16
-; RV32I-NEXT: sb t4, 22(a2)
-; RV32I-NEXT: or t1, t3, t1
-; RV32I-NEXT: sb t3, 20(a2)
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, a6, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, a6, s2
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: srli a6, a6, 8
-; RV32I-NEXT: sb a6, 9(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: sb a6, 14(a2)
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: srli a7, t0, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a5, 16
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: sb a5, 0(a2)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 1(a2)
-; RV32I-NEXT: srli a5, a4, 16
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t3, t3, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t3, 28(sp)
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: mv a1, sp
+; RV32I-NEXT: add a4, a1, a0
+; RV32I-NEXT: lw a1, 4(a4)
+; RV32I-NEXT: srl a0, a1, a7
+; RV32I-NEXT: lw a5, 8(a4)
+; RV32I-NEXT: andi a3, a7, 31
+; RV32I-NEXT: xori a6, a3, 31
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 16(a4)
+; RV32I-NEXT: sll a1, a1, a6
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srl a3, t0, a7
+; RV32I-NEXT: slli t2, t1, 1
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or a3, a3, t2
+; RV32I-NEXT: srl a5, a5, a7
+; RV32I-NEXT: slli t0, t0, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 24(a4)
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a5, a5, t0
+; RV32I-NEXT: srl t0, t2, a7
+; RV32I-NEXT: slli t4, t3, 1
+; RV32I-NEXT: sll t4, t4, a6
+; RV32I-NEXT: or t0, t0, t4
+; RV32I-NEXT: srl t1, t1, a7
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: srl t2, t3, a7
+; RV32I-NEXT: slli t3, a4, 1
+; RV32I-NEXT: sll a6, t3, a6
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: sra a4, a4, a7
+; RV32I-NEXT: sb a4, 28(a2)
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: sb a7, 31(a2)
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: sb a7, 30(a2)
; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(a2)
-; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb t1, 16(a2)
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, a6, 24
; RV32I-NEXT: sb a4, 27(a2)
-; RV32I-NEXT: srli a3, a3, 24
-; RV32I-NEXT: sb a3, 19(a2)
-; RV32I-NEXT: srli a3, t1, 24
-; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: srli a3, t3, 24
-; RV32I-NEXT: sb a3, 11(a2)
-; RV32I-NEXT: srli a3, a6, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 3(a2)
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a4, a6, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, a6, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
>From 90bdd43610ce819f50898df8b4adbc97aea9023b Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Fri, 21 Jun 2024 13:52:26 +0200
Subject: [PATCH 2/4] Address review comments
Use unaligned memory accesses only if the target supports fast unaligned
memory access and the shift amount is a multiple of CHAR_BIT.
Also address the formatting review comments.
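For reviewers skimming the patch, the alignment-selection policy boils down
to the small decision sketched below. This is a minimal standalone sketch,
not the real LLVM API: Target and chooseLoadStoreAlign are hypothetical
stand-ins for the TargetLowering/SelectionDAG queries the patch actually uses
(allowsMisalignedMemoryAccesses with its IsFast out-parameter, getReducedAlign).

    #include <cstdio>

    // Hypothetical stand-ins for the target/DAG queries in the legalizer.
    struct Target {
      bool FastMisalignedAccess; // allowsMisalignedMemoryAccesses(..., &IsFast)
      unsigned LegalTypeBytes;   // size of the widest legal load/store type
    };

    unsigned chooseLoadStoreAlign(const Target &T, unsigned KnownTrailingZeros) {
      // The shift amount is a byte multiple iff its 3 low bits are known zero
      // (log2(CHAR_BIT) == 3). Only then is byte-granular indexing profitable.
      if (T.FastMisalignedAccess && KnownTrailingZeros >= 3)
        return 1; // byte-aligned indexing into the stack slot, one-step shift
      return T.LegalTypeBytes; // e.g. 4 on RV32, 8 on RV64: all loads aligned
    }

    int main() {
      Target RV32{/*FastMisalignedAccess=*/false, /*LegalTypeBytes=*/4};
      // A fully variable shift amount has no known trailing zero bits.
      std::printf("RV32, variable shift: align %u\n",
                  chooseLoadStoreAlign(RV32, 0));
      Target X86{/*FastMisalignedAccess=*/true, /*LegalTypeBytes=*/8};
      // A shift by 8*x has at least 3 known trailing zeros.
      std::printf("x86, byte-multiple shift: align %u\n",
                  chooseLoadStoreAlign(X86, 3));
    }

Under this policy a target without fast misaligned access always indexes the
stack slot at register granularity, which is why the updated RISC-V and MIPS
tests in this patch load whole words (lw/sw) instead of per-byte or
lwl/lwr pairs.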
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 55 +-
.../AArch64/wide-scalar-shift-legalization.ll | 117 +-
llvm/test/CodeGen/Mips/llvm-ir/ashr.ll | 331 +-
llvm/test/CodeGen/Mips/llvm-ir/lshr.ll | 323 +-
llvm/test/CodeGen/Mips/llvm-ir/shl.ll | 302 +-
llvm/test/CodeGen/PowerPC/ctrloop-sh.ll | 244 +-
llvm/test/CodeGen/PowerPC/pr59074.ll | 80 +-
.../PowerPC/wide-scalar-shift-legalization.ll | 644 +-
.../X86/div-rem-pair-recomposition-signed.ll | 431 +-
.../div-rem-pair-recomposition-unsigned.ll | 346 +-
llvm/test/CodeGen/X86/pr38539.ll | 144 +-
.../CodeGen/X86/scheduler-backtracking.ll | 50 +-
llvm/test/CodeGen/X86/shift-i128.ll | 546 +-
llvm/test/CodeGen/X86/shift-i256.ll | 358 +-
.../X86/wide-scalar-shift-legalization.ll | 7086 ++++++++---------
15 files changed, 5019 insertions(+), 6038 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index f21ed7581a5af..cd40df473c67c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4532,19 +4532,29 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
EVT LoadStoreVT = VT;
do {
- LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
- }while (!TLI.isTypeLegal(LoadStoreVT));
+ LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
+ } while (!TLI.isTypeLegal(LoadStoreVT));
- const Align LoadStoreAlign = [&]() -> Align {
- if (TLI.allowsMisalignedMemoryAccesses(LoadStoreVT))
- return Align(1);
+ const unsigned KnownTrailingZeros =
+ DAG.computeKnownBits(ShAmt).countMinTrailingZeros();
- return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
+ const Align LoadStoreAlign = [&]() -> Align {
+ unsigned IsFast = 0;
+ const bool AllowsFastMisalignedMemoryAccesses =
+ TLI.allowsMisalignedMemoryAccesses(
+ LoadStoreVT, /*AddrSpace*/ 0, /*Alignment*/ Align(1),
+ /*Flags*/ MachineMemOperand::MONone, &IsFast) &&
+ IsFast;
+ if (AllowsFastMisalignedMemoryAccesses && KnownTrailingZeros >= 3)
+ return Align(1);
+
+ return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
}();
const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
const bool IsOneStepShift =
- DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= Log2_32(ShiftUnitInBits);
+ DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
+ Log2_32(ShiftUnitInBits);
// If we can't do it as one step, we'll have two uses of shift amount,
// and thus must freeze it.
@@ -4590,20 +4600,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
// We have shift amount, which is in bits. Offset should point to an aligned
// address.
SDNodeFlags Flags;
+
if (IsOneStepShift)
Flags.setExact(true);
- SDValue OffsetInBits = DAG.getNode(ISD::SHL, dl, ShAmtVT,
- DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt, DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT), Flags),
- DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+ SDValue SrlTmp = DAG.getNode(
+ ISD::SRL, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT), Flags);
+ SDValue OffsetInBits =
+ DAG.getNode(ISD::SHL, dl, ShAmtVT, SrlTmp,
+ DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+
Flags.setExact(true);
- SDValue Offset = DAG.getNode(
- ISD::SRL, dl, ShAmtVT,
- OffsetInBits,
- DAG.getConstant(3, dl, ShAmtVT), Flags);
+ SDValue Offset = DAG.getNode(ISD::SRL, dl, ShAmtVT, OffsetInBits,
+ DAG.getConstant(3, dl, ShAmtVT), Flags);
// And clamp it, because OOB load is an immediate UB,
// while shift overflow would have *just* been poison.
Offset = DAG.getNode(ISD::AND, dl, ShAmtVT, Offset,
- DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
+ DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
// We have exactly two strategies on indexing into stack slot here:
// 1. upwards starting from the beginning of the slot
// 2. downwards starting from the middle of the slot
@@ -4627,15 +4640,17 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
// And load it! While the load is not legal, legalizing it is obvious.
- SDValue Res = DAG.getLoad(
- VT, dl, Ch, AdjStackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), LoadStoreAlign);
+ SDValue Res =
+ DAG.getLoad(VT, dl, Ch, AdjStackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+ LoadStoreAlign);
// We've performed the shift by CHAR_BIT * [ShAmt / LoadAlign].
// If there may still be remaining bits to shift by, do so now.
if (!IsOneStepShift) {
- SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
- DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
+ SDValue ShAmtRem =
+ DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
}
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index a4da6db57ecae..531e0fa740da7 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -160,30 +160,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
-; ALL-NEXT: ubfx x8, x10, #3, #5
+; ALL-NEXT: lsr x8, x10, #3
; ALL-NEXT: mov x9, sp
; ALL-NEXT: str q1, [sp]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x8, x8, #0x18
; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: eor x12, x12, #0x3f
; ALL-NEXT: add x8, x9, x8
-; ALL-NEXT: mvn w13, w10
-; ALL-NEXT: ldp x11, x9, [x8, #16]
-; ALL-NEXT: ldp x8, x12, [x8]
+; ALL-NEXT: ldp x13, x11, [x8]
+; ALL-NEXT: ldr x9, [x8, #24]
+; ALL-NEXT: ldr x8, [x8, #16]
; ALL-NEXT: lsl x14, x9, #1
+; ALL-NEXT: lsr x9, x9, x10
; ALL-NEXT: lsl x15, x11, #1
; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsl x16, x12, #1
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsr x12, x12, x10
-; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x13, x13, x10
+; ALL-NEXT: lsl x14, x14, x12
+; ALL-NEXT: lsl x12, x15, x12
+; ALL-NEXT: lsl x15, x8, #1
; ALL-NEXT: lsr x8, x8, x10
-; ALL-NEXT: lsl x10, x16, x13
-; ALL-NEXT: lsl x13, x15, x13
-; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: stp x11, x9, [x2, #16]
-; ALL-NEXT: orr x8, x10, x8
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsl x10, x15, x10
+; ALL-NEXT: orr x8, x14, x8
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x9, x8, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -201,31 +204,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #48]
-; ALL-NEXT: mov x8, sp
-; ALL-NEXT: ubfx x9, x10, #3, #5
-; ALL-NEXT: add x8, x8, #32
+; ALL-NEXT: lsr x8, x10, #3
+; ALL-NEXT: mov x9, sp
+; ALL-NEXT: add x9, x9, #32
; ALL-NEXT: stp q0, q1, [sp, #16]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x8, x8, #0x18
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: sub x8, x8, x9
-; ALL-NEXT: mvn w13, w10
-; ALL-NEXT: ldp x9, x11, [x8]
-; ALL-NEXT: ldp x12, x8, [x8, #16]
-; ALL-NEXT: lsr x14, x9, #1
-; ALL-NEXT: lsr x15, x11, #1
-; ALL-NEXT: lsl x11, x11, x10
-; ALL-NEXT: lsr x16, x12, #1
+; ALL-NEXT: eor x12, x12, #0x3f
+; ALL-NEXT: sub x8, x9, x8
+; ALL-NEXT: ldp x11, x13, [x8, #16]
+; ALL-NEXT: ldr x9, [x8]
+; ALL-NEXT: ldr x8, [x8, #8]
+; ALL-NEXT: lsr x15, x9, #1
; ALL-NEXT: lsl x9, x9, x10
-; ALL-NEXT: lsl x12, x12, x10
-; ALL-NEXT: lsr x14, x14, x13
+; ALL-NEXT: lsr x14, x11, #1
+; ALL-NEXT: lsl x11, x11, x10
+; ALL-NEXT: lsl x13, x13, x10
+; ALL-NEXT: lsr x14, x14, x12
+; ALL-NEXT: lsr x12, x15, x12
+; ALL-NEXT: lsr x15, x8, #1
; ALL-NEXT: lsl x8, x8, x10
-; ALL-NEXT: lsr x10, x16, x13
-; ALL-NEXT: lsr x13, x15, x13
-; ALL-NEXT: orr x11, x11, x14
-; ALL-NEXT: stp x9, x11, [x2]
-; ALL-NEXT: orr x8, x8, x10
-; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x9, x8, [x2, #16]
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsr x10, x15, x10
+; ALL-NEXT: orr x8, x8, x12
+; ALL-NEXT: stp x9, x8, [x2]
+; ALL-NEXT: orr x9, x13, x14
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -243,31 +249,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q0, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: lsr x9, x10, #3
; ALL-NEXT: asr x8, x8, #63
-; ALL-NEXT: ubfx x9, x10, #3, #5
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x9, x9, #0x18
; ALL-NEXT: stp x8, x8, [sp, #48]
-; ALL-NEXT: add x9, x11, x9
-; ALL-NEXT: mvn w13, w10
+; ALL-NEXT: eor x12, x12, #0x3f
; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: ldp x11, x8, [x9, #16]
-; ALL-NEXT: ldp x9, x12, [x9]
-; ALL-NEXT: lsl x14, x8, #1
+; ALL-NEXT: add x8, x11, x9
+; ALL-NEXT: ldp x13, x11, [x8]
+; ALL-NEXT: ldr x9, [x8, #24]
+; ALL-NEXT: ldr x8, [x8, #16]
+; ALL-NEXT: lsl x14, x9, #1
+; ALL-NEXT: asr x9, x9, x10
; ALL-NEXT: lsl x15, x11, #1
; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsl x16, x12, #1
-; ALL-NEXT: asr x8, x8, x10
-; ALL-NEXT: lsr x12, x12, x10
-; ALL-NEXT: lsl x14, x14, x13
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsl x10, x16, x13
-; ALL-NEXT: lsl x13, x15, x13
-; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: stp x11, x8, [x2, #16]
-; ALL-NEXT: orr x8, x10, x9
+; ALL-NEXT: lsr x13, x13, x10
+; ALL-NEXT: lsl x14, x14, x12
+; ALL-NEXT: lsl x12, x15, x12
+; ALL-NEXT: lsl x15, x8, #1
+; ALL-NEXT: lsr x8, x8, x10
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsl x10, x15, x10
+; ALL-NEXT: orr x8, x14, x8
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x9, x8, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 450fe968d4917..6db3fb930b94e 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -382,53 +382,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS: # %bb.0: # %entry
; MIPS-NEXT: addiu $sp, $sp, -32
; MIPS-NEXT: .cfi_def_cfa_offset 32
-; MIPS-NEXT: swl $7, 28($sp)
-; MIPS-NEXT: swl $6, 24($sp)
; MIPS-NEXT: sra $1, $4, 31
-; MIPS-NEXT: swl $5, 20($sp)
-; MIPS-NEXT: swl $4, 16($sp)
-; MIPS-NEXT: swl $1, 12($sp)
-; MIPS-NEXT: swl $1, 8($sp)
-; MIPS-NEXT: swl $1, 4($sp)
-; MIPS-NEXT: swl $1, 0($sp)
-; MIPS-NEXT: addiu $2, $sp, 0
-; MIPS-NEXT: swr $7, 31($sp)
-; MIPS-NEXT: swr $6, 27($sp)
-; MIPS-NEXT: swr $5, 23($sp)
-; MIPS-NEXT: swr $4, 19($sp)
-; MIPS-NEXT: swr $1, 15($sp)
-; MIPS-NEXT: swr $1, 11($sp)
-; MIPS-NEXT: swr $1, 7($sp)
-; MIPS-NEXT: swr $1, 3($sp)
-; MIPS-NEXT: addiu $1, $2, 16
+; MIPS-NEXT: sw $7, 28($sp)
+; MIPS-NEXT: sw $6, 24($sp)
+; MIPS-NEXT: sw $5, 20($sp)
+; MIPS-NEXT: sw $4, 16($sp)
+; MIPS-NEXT: sw $1, 12($sp)
+; MIPS-NEXT: sw $1, 8($sp)
+; MIPS-NEXT: sw $1, 4($sp)
+; MIPS-NEXT: sw $1, 0($sp)
+; MIPS-NEXT: addiu $1, $sp, 0
+; MIPS-NEXT: addiu $1, $1, 16
; MIPS-NEXT: lw $2, 60($sp)
; MIPS-NEXT: srl $3, $2, 3
-; MIPS-NEXT: andi $3, $3, 15
+; MIPS-NEXT: andi $3, $3, 12
; MIPS-NEXT: subu $1, $1, $3
-; MIPS-NEXT: lwl $3, 4($1)
-; MIPS-NEXT: lwr $3, 7($1)
-; MIPS-NEXT: sll $4, $3, 1
-; MIPS-NEXT: lwl $5, 8($1)
-; MIPS-NEXT: lwr $5, 11($1)
-; MIPS-NEXT: andi $2, $2, 7
-; MIPS-NEXT: not $6, $2
-; MIPS-NEXT: srlv $7, $5, $2
-; MIPS-NEXT: sllv $4, $4, $6
+; MIPS-NEXT: lw $3, 4($1)
+; MIPS-NEXT: lw $5, 8($1)
+; MIPS-NEXT: srlv $4, $5, $2
+; MIPS-NEXT: sll $6, $3, 1
+; MIPS-NEXT: andi $7, $2, 31
+; MIPS-NEXT: xori $7, $7, 31
+; MIPS-NEXT: sllv $6, $6, $7
; MIPS-NEXT: srlv $3, $3, $2
-; MIPS-NEXT: lwl $6, 0($1)
-; MIPS-NEXT: lwr $6, 3($1)
-; MIPS-NEXT: sll $8, $6, 1
-; MIPS-NEXT: xori $9, $2, 31
-; MIPS-NEXT: sllv $8, $8, $9
-; MIPS-NEXT: or $3, $3, $8
-; MIPS-NEXT: or $4, $7, $4
-; MIPS-NEXT: lwl $7, 12($1)
-; MIPS-NEXT: lwr $7, 15($1)
-; MIPS-NEXT: srlv $1, $7, $2
+; MIPS-NEXT: lw $8, 0($1)
+; MIPS-NEXT: sll $9, $8, 1
+; MIPS-NEXT: sllv $9, $9, $7
+; MIPS-NEXT: or $3, $3, $9
+; MIPS-NEXT: or $4, $4, $6
+; MIPS-NEXT: lw $1, 12($1)
+; MIPS-NEXT: srlv $1, $1, $2
; MIPS-NEXT: sll $5, $5, 1
-; MIPS-NEXT: sllv $5, $5, $9
+; MIPS-NEXT: sllv $5, $5, $7
; MIPS-NEXT: or $5, $1, $5
-; MIPS-NEXT: srav $2, $6, $2
+; MIPS-NEXT: srav $2, $8, $2
; MIPS-NEXT: jr $ra
; MIPS-NEXT: addiu $sp, $sp, 32
;
@@ -436,53 +423,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $7, 28($sp)
-; MIPS32-NEXT: swl $6, 24($sp)
; MIPS32-NEXT: sra $1, $4, 31
-; MIPS32-NEXT: swl $5, 20($sp)
-; MIPS32-NEXT: swl $4, 16($sp)
-; MIPS32-NEXT: swl $1, 12($sp)
-; MIPS32-NEXT: swl $1, 8($sp)
-; MIPS32-NEXT: swl $1, 4($sp)
-; MIPS32-NEXT: swl $1, 0($sp)
-; MIPS32-NEXT: addiu $2, $sp, 0
-; MIPS32-NEXT: swr $7, 31($sp)
-; MIPS32-NEXT: swr $6, 27($sp)
-; MIPS32-NEXT: swr $5, 23($sp)
-; MIPS32-NEXT: swr $4, 19($sp)
-; MIPS32-NEXT: swr $1, 15($sp)
-; MIPS32-NEXT: swr $1, 11($sp)
-; MIPS32-NEXT: swr $1, 7($sp)
-; MIPS32-NEXT: swr $1, 3($sp)
-; MIPS32-NEXT: addiu $1, $2, 16
+; MIPS32-NEXT: sw $7, 28($sp)
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $5, 20($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
+; MIPS32-NEXT: sw $1, 12($sp)
+; MIPS32-NEXT: sw $1, 8($sp)
+; MIPS32-NEXT: sw $1, 4($sp)
+; MIPS32-NEXT: sw $1, 0($sp)
+; MIPS32-NEXT: addiu $1, $sp, 0
+; MIPS32-NEXT: addiu $1, $1, 16
; MIPS32-NEXT: lw $2, 60($sp)
; MIPS32-NEXT: srl $3, $2, 3
-; MIPS32-NEXT: andi $3, $3, 15
+; MIPS32-NEXT: andi $3, $3, 12
; MIPS32-NEXT: subu $1, $1, $3
-; MIPS32-NEXT: lwl $3, 4($1)
-; MIPS32-NEXT: lwr $3, 7($1)
-; MIPS32-NEXT: sll $4, $3, 1
-; MIPS32-NEXT: lwl $5, 8($1)
-; MIPS32-NEXT: lwr $5, 11($1)
-; MIPS32-NEXT: andi $2, $2, 7
-; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: srlv $7, $5, $2
-; MIPS32-NEXT: sllv $4, $4, $6
+; MIPS32-NEXT: lw $3, 4($1)
+; MIPS32-NEXT: lw $5, 8($1)
+; MIPS32-NEXT: srlv $4, $5, $2
+; MIPS32-NEXT: sll $6, $3, 1
+; MIPS32-NEXT: andi $7, $2, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: sllv $6, $6, $7
; MIPS32-NEXT: srlv $3, $3, $2
-; MIPS32-NEXT: lwl $6, 0($1)
-; MIPS32-NEXT: lwr $6, 3($1)
-; MIPS32-NEXT: sll $8, $6, 1
-; MIPS32-NEXT: xori $9, $2, 31
-; MIPS32-NEXT: sllv $8, $8, $9
-; MIPS32-NEXT: or $3, $3, $8
-; MIPS32-NEXT: or $4, $7, $4
-; MIPS32-NEXT: lwl $7, 12($1)
-; MIPS32-NEXT: lwr $7, 15($1)
-; MIPS32-NEXT: srlv $1, $7, $2
+; MIPS32-NEXT: lw $8, 0($1)
+; MIPS32-NEXT: sll $9, $8, 1
+; MIPS32-NEXT: sllv $9, $9, $7
+; MIPS32-NEXT: or $3, $3, $9
+; MIPS32-NEXT: or $4, $4, $6
+; MIPS32-NEXT: lw $1, 12($1)
+; MIPS32-NEXT: srlv $1, $1, $2
; MIPS32-NEXT: sll $5, $5, 1
-; MIPS32-NEXT: sllv $5, $5, $9
+; MIPS32-NEXT: sllv $5, $5, $7
; MIPS32-NEXT: or $5, $1, $5
-; MIPS32-NEXT: srav $2, $6, $2
+; MIPS32-NEXT: srav $2, $8, $2
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: addiu $sp, $sp, 32
;
@@ -490,52 +464,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R2: # %bb.0: # %entry
; 32R2-NEXT: addiu $sp, $sp, -32
; 32R2-NEXT: .cfi_def_cfa_offset 32
-; 32R2-NEXT: swl $7, 28($sp)
-; 32R2-NEXT: swl $6, 24($sp)
-; 32R2-NEXT: swl $5, 20($sp)
; 32R2-NEXT: sra $1, $4, 31
-; 32R2-NEXT: swl $4, 16($sp)
-; 32R2-NEXT: swl $1, 12($sp)
-; 32R2-NEXT: swl $1, 8($sp)
-; 32R2-NEXT: swl $1, 4($sp)
-; 32R2-NEXT: swl $1, 0($sp)
-; 32R2-NEXT: swr $7, 31($sp)
-; 32R2-NEXT: swr $6, 27($sp)
-; 32R2-NEXT: swr $5, 23($sp)
-; 32R2-NEXT: swr $4, 19($sp)
-; 32R2-NEXT: swr $1, 15($sp)
-; 32R2-NEXT: swr $1, 11($sp)
-; 32R2-NEXT: swr $1, 7($sp)
-; 32R2-NEXT: swr $1, 3($sp)
+; 32R2-NEXT: sw $7, 28($sp)
+; 32R2-NEXT: sw $6, 24($sp)
+; 32R2-NEXT: sw $5, 20($sp)
+; 32R2-NEXT: sw $4, 16($sp)
+; 32R2-NEXT: sw $1, 12($sp)
+; 32R2-NEXT: sw $1, 8($sp)
+; 32R2-NEXT: sw $1, 4($sp)
+; 32R2-NEXT: sw $1, 0($sp)
; 32R2-NEXT: addiu $1, $sp, 0
; 32R2-NEXT: addiu $1, $1, 16
; 32R2-NEXT: lw $2, 60($sp)
-; 32R2-NEXT: ext $3, $2, 3, 4
+; 32R2-NEXT: srl $3, $2, 3
+; 32R2-NEXT: andi $3, $3, 12
; 32R2-NEXT: subu $1, $1, $3
-; 32R2-NEXT: lwl $3, 4($1)
-; 32R2-NEXT: lwr $3, 7($1)
-; 32R2-NEXT: sll $4, $3, 1
-; 32R2-NEXT: lwl $5, 8($1)
-; 32R2-NEXT: lwr $5, 11($1)
-; 32R2-NEXT: andi $2, $2, 7
-; 32R2-NEXT: not $6, $2
-; 32R2-NEXT: srlv $7, $5, $2
-; 32R2-NEXT: sllv $4, $4, $6
+; 32R2-NEXT: lw $3, 4($1)
+; 32R2-NEXT: lw $5, 8($1)
+; 32R2-NEXT: srlv $4, $5, $2
+; 32R2-NEXT: sll $6, $3, 1
+; 32R2-NEXT: andi $7, $2, 31
+; 32R2-NEXT: xori $7, $7, 31
+; 32R2-NEXT: sllv $6, $6, $7
; 32R2-NEXT: srlv $3, $3, $2
-; 32R2-NEXT: lwl $6, 0($1)
-; 32R2-NEXT: lwr $6, 3($1)
-; 32R2-NEXT: sll $8, $6, 1
-; 32R2-NEXT: xori $9, $2, 31
-; 32R2-NEXT: sllv $8, $8, $9
-; 32R2-NEXT: or $3, $3, $8
-; 32R2-NEXT: or $4, $7, $4
-; 32R2-NEXT: lwl $7, 12($1)
-; 32R2-NEXT: lwr $7, 15($1)
-; 32R2-NEXT: srlv $1, $7, $2
+; 32R2-NEXT: lw $8, 0($1)
+; 32R2-NEXT: sll $9, $8, 1
+; 32R2-NEXT: sllv $9, $9, $7
+; 32R2-NEXT: or $3, $3, $9
+; 32R2-NEXT: or $4, $4, $6
+; 32R2-NEXT: lw $1, 12($1)
+; 32R2-NEXT: srlv $1, $1, $2
; 32R2-NEXT: sll $5, $5, 1
-; 32R2-NEXT: sllv $5, $5, $9
+; 32R2-NEXT: sllv $5, $5, $7
; 32R2-NEXT: or $5, $1, $5
-; 32R2-NEXT: srav $2, $6, $2
+; 32R2-NEXT: srav $2, $8, $2
; 32R2-NEXT: jr $ra
; 32R2-NEXT: addiu $sp, $sp, 32
;
@@ -555,28 +517,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R6-NEXT: addiu $1, $sp, 0
; 32R6-NEXT: addiu $1, $1, 16
; 32R6-NEXT: lw $2, 60($sp)
-; 32R6-NEXT: ext $3, $2, 3, 4
+; 32R6-NEXT: srl $3, $2, 3
+; 32R6-NEXT: andi $3, $3, 12
; 32R6-NEXT: subu $1, $1, $3
; 32R6-NEXT: lw $3, 4($1)
-; 32R6-NEXT: sll $4, $3, 1
; 32R6-NEXT: lw $5, 8($1)
-; 32R6-NEXT: andi $2, $2, 7
-; 32R6-NEXT: not $6, $2
-; 32R6-NEXT: srlv $7, $5, $2
-; 32R6-NEXT: sllv $4, $4, $6
+; 32R6-NEXT: srlv $4, $5, $2
+; 32R6-NEXT: sll $6, $3, 1
+; 32R6-NEXT: andi $7, $2, 31
+; 32R6-NEXT: xori $7, $7, 31
+; 32R6-NEXT: sllv $6, $6, $7
; 32R6-NEXT: srlv $3, $3, $2
-; 32R6-NEXT: lw $6, 0($1)
-; 32R6-NEXT: sll $8, $6, 1
-; 32R6-NEXT: xori $9, $2, 31
-; 32R6-NEXT: sllv $8, $8, $9
-; 32R6-NEXT: or $3, $3, $8
-; 32R6-NEXT: or $4, $7, $4
+; 32R6-NEXT: lw $8, 0($1)
+; 32R6-NEXT: sll $9, $8, 1
+; 32R6-NEXT: sllv $9, $9, $7
+; 32R6-NEXT: or $3, $3, $9
+; 32R6-NEXT: or $4, $4, $6
; 32R6-NEXT: lw $1, 12($1)
; 32R6-NEXT: srlv $1, $1, $2
; 32R6-NEXT: sll $5, $5, 1
-; 32R6-NEXT: sllv $5, $5, $9
+; 32R6-NEXT: sllv $5, $5, $7
; 32R6-NEXT: or $5, $1, $5
-; 32R6-NEXT: srav $2, $6, $2
+; 32R6-NEXT: srav $2, $8, $2
; 32R6-NEXT: jr $ra
; 32R6-NEXT: addiu $sp, $sp, 32
;
@@ -656,53 +618,37 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: swl $7, 28($sp)
-; MMR3-NEXT: swl $6, 24($sp)
-; MMR3-NEXT: swl $5, 20($sp)
; MMR3-NEXT: sra $1, $4, 31
-; MMR3-NEXT: swl $4, 16($sp)
-; MMR3-NEXT: swl $1, 12($sp)
-; MMR3-NEXT: swl $1, 8($sp)
-; MMR3-NEXT: swl $1, 4($sp)
-; MMR3-NEXT: swl $1, 0($sp)
-; MMR3-NEXT: swr $7, 31($sp)
-; MMR3-NEXT: swr $6, 27($sp)
-; MMR3-NEXT: swr $5, 23($sp)
-; MMR3-NEXT: swr $4, 19($sp)
-; MMR3-NEXT: swr $1, 15($sp)
-; MMR3-NEXT: swr $1, 11($sp)
-; MMR3-NEXT: swr $1, 7($sp)
-; MMR3-NEXT: swr $1, 3($sp)
+; MMR3-NEXT: swp $6, 24($sp)
+; MMR3-NEXT: swp $4, 16($sp)
+; MMR3-NEXT: sw $1, 12($sp)
+; MMR3-NEXT: sw $1, 8($sp)
+; MMR3-NEXT: sw $1, 4($sp)
+; MMR3-NEXT: sw $1, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
-; MMR3-NEXT: ext $4, $3, 3, 4
-; MMR3-NEXT: subu16 $2, $2, $4
-; MMR3-NEXT: lwl $7, 4($2)
-; MMR3-NEXT: lwr $7, 7($2)
-; MMR3-NEXT: sll16 $4, $7, 1
-; MMR3-NEXT: lwl $5, 8($2)
-; MMR3-NEXT: lwr $5, 11($2)
-; MMR3-NEXT: andi16 $6, $3, 7
-; MMR3-NEXT: not16 $3, $6
-; MMR3-NEXT: andi16 $3, $3, 31
-; MMR3-NEXT: srlv $16, $5, $6
-; MMR3-NEXT: sllv $4, $4, $3
-; MMR3-NEXT: srlv $17, $7, $6
-; MMR3-NEXT: lwl $7, 0($2)
-; MMR3-NEXT: lwr $7, 3($2)
-; MMR3-NEXT: sll16 $3, $7, 1
-; MMR3-NEXT: xori $1, $6, 31
+; MMR3-NEXT: srl16 $4, $3, 3
+; MMR3-NEXT: andi $4, $4, 12
+; MMR3-NEXT: subu16 $5, $2, $4
+; MMR3-NEXT: lwp $6, 4($5)
+; MMR3-NEXT: andi16 $2, $3, 31
+; MMR3-NEXT: srlv $16, $7, $2
+; MMR3-NEXT: sll16 $3, $6, 1
+; MMR3-NEXT: xori $1, $2, 31
+; MMR3-NEXT: sllv $4, $3, $1
+; MMR3-NEXT: srlv $6, $6, $2
+; MMR3-NEXT: lw16 $17, 0($5)
+; MMR3-NEXT: sll16 $3, $17, 1
; MMR3-NEXT: sllv $3, $3, $1
-; MMR3-NEXT: or16 $3, $17
+; MMR3-NEXT: or16 $3, $6
; MMR3-NEXT: or16 $4, $16
-; MMR3-NEXT: lwl $8, 12($2)
-; MMR3-NEXT: lwr $8, 15($2)
-; MMR3-NEXT: srlv $2, $8, $6
-; MMR3-NEXT: sll16 $5, $5, 1
+; MMR3-NEXT: lw16 $5, 12($5)
+; MMR3-NEXT: srlv $6, $5, $2
+; MMR3-NEXT: sll16 $5, $7, 1
; MMR3-NEXT: sllv $5, $5, $1
-; MMR3-NEXT: or16 $5, $2
-; MMR3-NEXT: srav $2, $7, $6
+; MMR3-NEXT: or16 $5, $6
+; MMR3-NEXT: srav $2, $17, $2
; MMR3-NEXT: lwp $16, 32($sp)
; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
@@ -725,29 +671,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: addiu $2, $sp, 4
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
-; MMR6-NEXT: ext $4, $3, 3, 4
-; MMR6-NEXT: subu16 $5, $2, $4
-; MMR6-NEXT: lw16 $4, 4($5)
-; MMR6-NEXT: sll16 $6, $4, 1
-; MMR6-NEXT: lw16 $7, 8($5)
-; MMR6-NEXT: andi16 $2, $3, 7
-; MMR6-NEXT: not16 $3, $2
-; MMR6-NEXT: andi16 $3, $3, 31
-; MMR6-NEXT: srlv $1, $7, $2
-; MMR6-NEXT: sllv $6, $6, $3
-; MMR6-NEXT: srlv $3, $4, $2
-; MMR6-NEXT: lw16 $16, 0($5)
+; MMR6-NEXT: srl16 $4, $3, 3
+; MMR6-NEXT: andi $4, $4, 12
+; MMR6-NEXT: subu16 $2, $2, $4
+; MMR6-NEXT: lw16 $4, 4($2)
+; MMR6-NEXT: lw16 $5, 8($2)
+; MMR6-NEXT: andi16 $6, $3, 31
+; MMR6-NEXT: srlv $1, $5, $6
+; MMR6-NEXT: sll16 $3, $4, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: sllv $8, $3, $7
+; MMR6-NEXT: srlv $3, $4, $6
+; MMR6-NEXT: lw16 $16, 0($2)
; MMR6-NEXT: sll16 $4, $16, 1
-; MMR6-NEXT: xori $8, $2, 31
-; MMR6-NEXT: sllv $4, $4, $8
+; MMR6-NEXT: sllv $4, $4, $7
; MMR6-NEXT: or $3, $3, $4
-; MMR6-NEXT: or $4, $1, $6
-; MMR6-NEXT: lw16 $5, 12($5)
-; MMR6-NEXT: srlv $1, $5, $2
-; MMR6-NEXT: sll16 $5, $7, 1
-; MMR6-NEXT: sllv $5, $5, $8
-; MMR6-NEXT: or $5, $1, $5
-; MMR6-NEXT: srav $2, $16, $2
+; MMR6-NEXT: or $4, $1, $8
+; MMR6-NEXT: lw16 $2, 12($2)
+; MMR6-NEXT: srlv $1, $2, $6
+; MMR6-NEXT: sll16 $2, $5, 1
+; MMR6-NEXT: sllv $2, $2, $7
+; MMR6-NEXT: or $5, $1, $2
+; MMR6-NEXT: srav $2, $16, $6
; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index 03cf104e3120c..fa10293c0f6fb 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -398,52 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS2: # %bb.0: # %entry
; MIPS2-NEXT: addiu $sp, $sp, -32
; MIPS2-NEXT: .cfi_def_cfa_offset 32
-; MIPS2-NEXT: swl $7, 28($sp)
-; MIPS2-NEXT: swl $6, 24($sp)
-; MIPS2-NEXT: swl $5, 20($sp)
-; MIPS2-NEXT: swl $4, 16($sp)
-; MIPS2-NEXT: swl $zero, 12($sp)
-; MIPS2-NEXT: swl $zero, 8($sp)
-; MIPS2-NEXT: swl $zero, 4($sp)
-; MIPS2-NEXT: swl $zero, 0($sp)
; MIPS2-NEXT: addiu $1, $sp, 0
-; MIPS2-NEXT: swr $7, 31($sp)
-; MIPS2-NEXT: swr $6, 27($sp)
-; MIPS2-NEXT: swr $5, 23($sp)
-; MIPS2-NEXT: swr $4, 19($sp)
-; MIPS2-NEXT: swr $zero, 15($sp)
-; MIPS2-NEXT: swr $zero, 11($sp)
-; MIPS2-NEXT: swr $zero, 7($sp)
-; MIPS2-NEXT: swr $zero, 3($sp)
+; MIPS2-NEXT: sw $7, 28($sp)
+; MIPS2-NEXT: sw $6, 24($sp)
+; MIPS2-NEXT: sw $5, 20($sp)
+; MIPS2-NEXT: sw $4, 16($sp)
; MIPS2-NEXT: addiu $1, $1, 16
; MIPS2-NEXT: lw $2, 60($sp)
; MIPS2-NEXT: srl $3, $2, 3
-; MIPS2-NEXT: andi $3, $3, 15
+; MIPS2-NEXT: andi $3, $3, 12
; MIPS2-NEXT: subu $1, $1, $3
-; MIPS2-NEXT: lwl $3, 4($1)
-; MIPS2-NEXT: lwr $3, 7($1)
-; MIPS2-NEXT: sll $4, $3, 1
-; MIPS2-NEXT: lwl $5, 8($1)
-; MIPS2-NEXT: lwr $5, 11($1)
-; MIPS2-NEXT: andi $2, $2, 7
-; MIPS2-NEXT: not $6, $2
-; MIPS2-NEXT: srlv $7, $5, $2
-; MIPS2-NEXT: sllv $4, $4, $6
+; MIPS2-NEXT: sw $zero, 12($sp)
+; MIPS2-NEXT: sw $zero, 8($sp)
+; MIPS2-NEXT: sw $zero, 4($sp)
+; MIPS2-NEXT: sw $zero, 0($sp)
+; MIPS2-NEXT: lw $3, 4($1)
+; MIPS2-NEXT: lw $5, 8($1)
+; MIPS2-NEXT: srlv $4, $5, $2
+; MIPS2-NEXT: sll $6, $3, 1
+; MIPS2-NEXT: andi $7, $2, 31
+; MIPS2-NEXT: xori $7, $7, 31
+; MIPS2-NEXT: sllv $6, $6, $7
; MIPS2-NEXT: srlv $3, $3, $2
-; MIPS2-NEXT: lwl $6, 0($1)
-; MIPS2-NEXT: lwr $6, 3($1)
-; MIPS2-NEXT: sll $8, $6, 1
-; MIPS2-NEXT: xori $9, $2, 31
-; MIPS2-NEXT: sllv $8, $8, $9
-; MIPS2-NEXT: or $3, $3, $8
-; MIPS2-NEXT: or $4, $7, $4
-; MIPS2-NEXT: lwl $7, 12($1)
-; MIPS2-NEXT: lwr $7, 15($1)
-; MIPS2-NEXT: srlv $1, $7, $2
+; MIPS2-NEXT: lw $8, 0($1)
+; MIPS2-NEXT: sll $9, $8, 1
+; MIPS2-NEXT: sllv $9, $9, $7
+; MIPS2-NEXT: or $3, $3, $9
+; MIPS2-NEXT: or $4, $4, $6
+; MIPS2-NEXT: lw $1, 12($1)
+; MIPS2-NEXT: srlv $1, $1, $2
; MIPS2-NEXT: sll $5, $5, 1
-; MIPS2-NEXT: sllv $5, $5, $9
+; MIPS2-NEXT: sllv $5, $5, $7
; MIPS2-NEXT: or $5, $1, $5
-; MIPS2-NEXT: srlv $2, $6, $2
+; MIPS2-NEXT: srlv $2, $8, $2
; MIPS2-NEXT: jr $ra
; MIPS2-NEXT: addiu $sp, $sp, 32
;
@@ -451,52 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $7, 28($sp)
-; MIPS32-NEXT: swl $6, 24($sp)
-; MIPS32-NEXT: swl $5, 20($sp)
-; MIPS32-NEXT: swl $4, 16($sp)
-; MIPS32-NEXT: swl $zero, 12($sp)
-; MIPS32-NEXT: swl $zero, 8($sp)
-; MIPS32-NEXT: swl $zero, 4($sp)
-; MIPS32-NEXT: swl $zero, 0($sp)
; MIPS32-NEXT: addiu $1, $sp, 0
-; MIPS32-NEXT: swr $7, 31($sp)
-; MIPS32-NEXT: swr $6, 27($sp)
-; MIPS32-NEXT: swr $5, 23($sp)
-; MIPS32-NEXT: swr $4, 19($sp)
-; MIPS32-NEXT: swr $zero, 15($sp)
-; MIPS32-NEXT: swr $zero, 11($sp)
-; MIPS32-NEXT: swr $zero, 7($sp)
-; MIPS32-NEXT: swr $zero, 3($sp)
+; MIPS32-NEXT: sw $7, 28($sp)
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $5, 20($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
; MIPS32-NEXT: addiu $1, $1, 16
; MIPS32-NEXT: lw $2, 60($sp)
; MIPS32-NEXT: srl $3, $2, 3
-; MIPS32-NEXT: andi $3, $3, 15
+; MIPS32-NEXT: andi $3, $3, 12
; MIPS32-NEXT: subu $1, $1, $3
-; MIPS32-NEXT: lwl $3, 4($1)
-; MIPS32-NEXT: lwr $3, 7($1)
-; MIPS32-NEXT: sll $4, $3, 1
-; MIPS32-NEXT: lwl $5, 8($1)
-; MIPS32-NEXT: lwr $5, 11($1)
-; MIPS32-NEXT: andi $2, $2, 7
-; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: srlv $7, $5, $2
-; MIPS32-NEXT: sllv $4, $4, $6
+; MIPS32-NEXT: sw $zero, 12($sp)
+; MIPS32-NEXT: sw $zero, 8($sp)
+; MIPS32-NEXT: sw $zero, 4($sp)
+; MIPS32-NEXT: sw $zero, 0($sp)
+; MIPS32-NEXT: lw $3, 4($1)
+; MIPS32-NEXT: lw $5, 8($1)
+; MIPS32-NEXT: srlv $4, $5, $2
+; MIPS32-NEXT: sll $6, $3, 1
+; MIPS32-NEXT: andi $7, $2, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: sllv $6, $6, $7
; MIPS32-NEXT: srlv $3, $3, $2
-; MIPS32-NEXT: lwl $6, 0($1)
-; MIPS32-NEXT: lwr $6, 3($1)
-; MIPS32-NEXT: sll $8, $6, 1
-; MIPS32-NEXT: xori $9, $2, 31
-; MIPS32-NEXT: sllv $8, $8, $9
-; MIPS32-NEXT: or $3, $3, $8
-; MIPS32-NEXT: or $4, $7, $4
-; MIPS32-NEXT: lwl $7, 12($1)
-; MIPS32-NEXT: lwr $7, 15($1)
-; MIPS32-NEXT: srlv $1, $7, $2
+; MIPS32-NEXT: lw $8, 0($1)
+; MIPS32-NEXT: sll $9, $8, 1
+; MIPS32-NEXT: sllv $9, $9, $7
+; MIPS32-NEXT: or $3, $3, $9
+; MIPS32-NEXT: or $4, $4, $6
+; MIPS32-NEXT: lw $1, 12($1)
+; MIPS32-NEXT: srlv $1, $1, $2
; MIPS32-NEXT: sll $5, $5, 1
-; MIPS32-NEXT: sllv $5, $5, $9
+; MIPS32-NEXT: sllv $5, $5, $7
; MIPS32-NEXT: or $5, $1, $5
-; MIPS32-NEXT: srlv $2, $6, $2
+; MIPS32-NEXT: srlv $2, $8, $2
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: addiu $sp, $sp, 32
;
@@ -504,51 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2: # %bb.0: # %entry
; MIPS32R2-NEXT: addiu $sp, $sp, -32
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT: swl $7, 28($sp)
-; MIPS32R2-NEXT: swl $6, 24($sp)
-; MIPS32R2-NEXT: swl $5, 20($sp)
-; MIPS32R2-NEXT: swl $4, 16($sp)
-; MIPS32R2-NEXT: swl $zero, 12($sp)
-; MIPS32R2-NEXT: swl $zero, 8($sp)
-; MIPS32R2-NEXT: swl $zero, 4($sp)
-; MIPS32R2-NEXT: swl $zero, 0($sp)
-; MIPS32R2-NEXT: swr $7, 31($sp)
-; MIPS32R2-NEXT: swr $6, 27($sp)
-; MIPS32R2-NEXT: swr $5, 23($sp)
-; MIPS32R2-NEXT: swr $4, 19($sp)
-; MIPS32R2-NEXT: swr $zero, 15($sp)
-; MIPS32R2-NEXT: swr $zero, 11($sp)
-; MIPS32R2-NEXT: swr $zero, 7($sp)
-; MIPS32R2-NEXT: swr $zero, 3($sp)
; MIPS32R2-NEXT: addiu $1, $sp, 0
+; MIPS32R2-NEXT: sw $7, 28($sp)
+; MIPS32R2-NEXT: sw $6, 24($sp)
+; MIPS32R2-NEXT: sw $5, 20($sp)
+; MIPS32R2-NEXT: sw $4, 16($sp)
; MIPS32R2-NEXT: addiu $1, $1, 16
; MIPS32R2-NEXT: lw $2, 60($sp)
-; MIPS32R2-NEXT: ext $3, $2, 3, 4
+; MIPS32R2-NEXT: srl $3, $2, 3
+; MIPS32R2-NEXT: andi $3, $3, 12
; MIPS32R2-NEXT: subu $1, $1, $3
-; MIPS32R2-NEXT: lwl $3, 4($1)
-; MIPS32R2-NEXT: lwr $3, 7($1)
-; MIPS32R2-NEXT: sll $4, $3, 1
-; MIPS32R2-NEXT: lwl $5, 8($1)
-; MIPS32R2-NEXT: lwr $5, 11($1)
-; MIPS32R2-NEXT: andi $2, $2, 7
-; MIPS32R2-NEXT: not $6, $2
-; MIPS32R2-NEXT: srlv $7, $5, $2
-; MIPS32R2-NEXT: sllv $4, $4, $6
+; MIPS32R2-NEXT: sw $zero, 12($sp)
+; MIPS32R2-NEXT: sw $zero, 8($sp)
+; MIPS32R2-NEXT: sw $zero, 4($sp)
+; MIPS32R2-NEXT: sw $zero, 0($sp)
+; MIPS32R2-NEXT: lw $3, 4($1)
+; MIPS32R2-NEXT: lw $5, 8($1)
+; MIPS32R2-NEXT: srlv $4, $5, $2
+; MIPS32R2-NEXT: sll $6, $3, 1
+; MIPS32R2-NEXT: andi $7, $2, 31
+; MIPS32R2-NEXT: xori $7, $7, 31
+; MIPS32R2-NEXT: sllv $6, $6, $7
; MIPS32R2-NEXT: srlv $3, $3, $2
-; MIPS32R2-NEXT: lwl $6, 0($1)
-; MIPS32R2-NEXT: lwr $6, 3($1)
-; MIPS32R2-NEXT: sll $8, $6, 1
-; MIPS32R2-NEXT: xori $9, $2, 31
-; MIPS32R2-NEXT: sllv $8, $8, $9
-; MIPS32R2-NEXT: or $3, $3, $8
-; MIPS32R2-NEXT: or $4, $7, $4
-; MIPS32R2-NEXT: lwl $7, 12($1)
-; MIPS32R2-NEXT: lwr $7, 15($1)
-; MIPS32R2-NEXT: srlv $1, $7, $2
+; MIPS32R2-NEXT: lw $8, 0($1)
+; MIPS32R2-NEXT: sll $9, $8, 1
+; MIPS32R2-NEXT: sllv $9, $9, $7
+; MIPS32R2-NEXT: or $3, $3, $9
+; MIPS32R2-NEXT: or $4, $4, $6
+; MIPS32R2-NEXT: lw $1, 12($1)
+; MIPS32R2-NEXT: srlv $1, $1, $2
; MIPS32R2-NEXT: sll $5, $5, 1
-; MIPS32R2-NEXT: sllv $5, $5, $9
+; MIPS32R2-NEXT: sllv $5, $5, $7
; MIPS32R2-NEXT: or $5, $1, $5
-; MIPS32R2-NEXT: srlv $2, $6, $2
+; MIPS32R2-NEXT: srlv $2, $8, $2
; MIPS32R2-NEXT: jr $ra
; MIPS32R2-NEXT: addiu $sp, $sp, 32
;
@@ -563,32 +525,32 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: sw $4, 16($sp)
; MIPS32R6-NEXT: addiu $1, $1, 16
; MIPS32R6-NEXT: lw $2, 60($sp)
-; MIPS32R6-NEXT: ext $3, $2, 3, 4
+; MIPS32R6-NEXT: srl $3, $2, 3
+; MIPS32R6-NEXT: andi $3, $3, 12
; MIPS32R6-NEXT: subu $1, $1, $3
; MIPS32R6-NEXT: sw $zero, 12($sp)
; MIPS32R6-NEXT: sw $zero, 8($sp)
; MIPS32R6-NEXT: sw $zero, 4($sp)
; MIPS32R6-NEXT: sw $zero, 0($sp)
; MIPS32R6-NEXT: lw $3, 4($1)
-; MIPS32R6-NEXT: sll $4, $3, 1
; MIPS32R6-NEXT: lw $5, 8($1)
-; MIPS32R6-NEXT: andi $2, $2, 7
-; MIPS32R6-NEXT: not $6, $2
-; MIPS32R6-NEXT: srlv $7, $5, $2
-; MIPS32R6-NEXT: sllv $4, $4, $6
+; MIPS32R6-NEXT: srlv $4, $5, $2
+; MIPS32R6-NEXT: sll $6, $3, 1
+; MIPS32R6-NEXT: andi $7, $2, 31
+; MIPS32R6-NEXT: xori $7, $7, 31
+; MIPS32R6-NEXT: sllv $6, $6, $7
; MIPS32R6-NEXT: srlv $3, $3, $2
-; MIPS32R6-NEXT: lw $6, 0($1)
-; MIPS32R6-NEXT: sll $8, $6, 1
-; MIPS32R6-NEXT: xori $9, $2, 31
-; MIPS32R6-NEXT: sllv $8, $8, $9
-; MIPS32R6-NEXT: or $3, $3, $8
-; MIPS32R6-NEXT: or $4, $7, $4
+; MIPS32R6-NEXT: lw $8, 0($1)
+; MIPS32R6-NEXT: sll $9, $8, 1
+; MIPS32R6-NEXT: sllv $9, $9, $7
+; MIPS32R6-NEXT: or $3, $3, $9
+; MIPS32R6-NEXT: or $4, $4, $6
; MIPS32R6-NEXT: lw $1, 12($1)
; MIPS32R6-NEXT: srlv $1, $1, $2
; MIPS32R6-NEXT: sll $5, $5, 1
-; MIPS32R6-NEXT: sllv $5, $5, $9
+; MIPS32R6-NEXT: sllv $5, $5, $7
; MIPS32R6-NEXT: or $5, $1, $5
-; MIPS32R6-NEXT: srlv $2, $6, $2
+; MIPS32R6-NEXT: srlv $2, $8, $2
; MIPS32R6-NEXT: jr $ra
; MIPS32R6-NEXT: addiu $sp, $sp, 32
;
@@ -677,53 +639,37 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: swl $7, 28($sp)
-; MMR3-NEXT: swl $6, 24($sp)
-; MMR3-NEXT: swl $5, 20($sp)
; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: swl $4, 16($sp)
-; MMR3-NEXT: swl $2, 12($sp)
-; MMR3-NEXT: swl $2, 8($sp)
-; MMR3-NEXT: swl $2, 4($sp)
-; MMR3-NEXT: swl $2, 0($sp)
-; MMR3-NEXT: swr $7, 31($sp)
-; MMR3-NEXT: swr $6, 27($sp)
-; MMR3-NEXT: swr $5, 23($sp)
-; MMR3-NEXT: swr $4, 19($sp)
-; MMR3-NEXT: swr $2, 15($sp)
-; MMR3-NEXT: swr $2, 11($sp)
-; MMR3-NEXT: swr $2, 7($sp)
-; MMR3-NEXT: swr $2, 3($sp)
+; MMR3-NEXT: swp $6, 24($sp)
+; MMR3-NEXT: swp $4, 16($sp)
+; MMR3-NEXT: sw $2, 12($sp)
+; MMR3-NEXT: sw $2, 8($sp)
+; MMR3-NEXT: sw $2, 4($sp)
+; MMR3-NEXT: sw $2, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
-; MMR3-NEXT: ext $4, $3, 3, 4
-; MMR3-NEXT: subu16 $2, $2, $4
-; MMR3-NEXT: lwl $7, 4($2)
-; MMR3-NEXT: lwr $7, 7($2)
-; MMR3-NEXT: sll16 $4, $7, 1
-; MMR3-NEXT: lwl $5, 8($2)
-; MMR3-NEXT: lwr $5, 11($2)
-; MMR3-NEXT: andi16 $6, $3, 7
-; MMR3-NEXT: not16 $3, $6
-; MMR3-NEXT: andi16 $3, $3, 31
-; MMR3-NEXT: srlv $16, $5, $6
-; MMR3-NEXT: sllv $4, $4, $3
-; MMR3-NEXT: srlv $17, $7, $6
-; MMR3-NEXT: lwl $7, 0($2)
-; MMR3-NEXT: lwr $7, 3($2)
-; MMR3-NEXT: sll16 $3, $7, 1
-; MMR3-NEXT: xori $1, $6, 31
+; MMR3-NEXT: srl16 $4, $3, 3
+; MMR3-NEXT: andi $4, $4, 12
+; MMR3-NEXT: subu16 $5, $2, $4
+; MMR3-NEXT: lwp $6, 4($5)
+; MMR3-NEXT: andi16 $2, $3, 31
+; MMR3-NEXT: srlv $16, $7, $2
+; MMR3-NEXT: sll16 $3, $6, 1
+; MMR3-NEXT: xori $1, $2, 31
+; MMR3-NEXT: sllv $4, $3, $1
+; MMR3-NEXT: srlv $6, $6, $2
+; MMR3-NEXT: lw16 $17, 0($5)
+; MMR3-NEXT: sll16 $3, $17, 1
; MMR3-NEXT: sllv $3, $3, $1
-; MMR3-NEXT: or16 $3, $17
+; MMR3-NEXT: or16 $3, $6
; MMR3-NEXT: or16 $4, $16
-; MMR3-NEXT: lwl $8, 12($2)
-; MMR3-NEXT: lwr $8, 15($2)
-; MMR3-NEXT: srlv $2, $8, $6
-; MMR3-NEXT: sll16 $5, $5, 1
+; MMR3-NEXT: lw16 $5, 12($5)
+; MMR3-NEXT: srlv $6, $5, $2
+; MMR3-NEXT: sll16 $5, $7, 1
; MMR3-NEXT: sllv $5, $5, $1
-; MMR3-NEXT: or16 $5, $2
-; MMR3-NEXT: srlv $2, $7, $6
+; MMR3-NEXT: or16 $5, $6
+; MMR3-NEXT: srlv $2, $17, $2
; MMR3-NEXT: lwp $16, 32($sp)
; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
@@ -746,29 +692,28 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: addiu $2, $sp, 4
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
-; MMR6-NEXT: ext $4, $3, 3, 4
-; MMR6-NEXT: subu16 $5, $2, $4
-; MMR6-NEXT: lw16 $4, 4($5)
-; MMR6-NEXT: sll16 $6, $4, 1
-; MMR6-NEXT: lw16 $7, 8($5)
-; MMR6-NEXT: andi16 $2, $3, 7
-; MMR6-NEXT: not16 $3, $2
-; MMR6-NEXT: andi16 $3, $3, 31
-; MMR6-NEXT: srlv $1, $7, $2
-; MMR6-NEXT: sllv $6, $6, $3
-; MMR6-NEXT: srlv $3, $4, $2
-; MMR6-NEXT: lw16 $16, 0($5)
+; MMR6-NEXT: srl16 $4, $3, 3
+; MMR6-NEXT: andi $4, $4, 12
+; MMR6-NEXT: subu16 $2, $2, $4
+; MMR6-NEXT: lw16 $4, 4($2)
+; MMR6-NEXT: lw16 $5, 8($2)
+; MMR6-NEXT: andi16 $6, $3, 31
+; MMR6-NEXT: srlv $1, $5, $6
+; MMR6-NEXT: sll16 $3, $4, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: sllv $8, $3, $7
+; MMR6-NEXT: srlv $3, $4, $6
+; MMR6-NEXT: lw16 $16, 0($2)
; MMR6-NEXT: sll16 $4, $16, 1
-; MMR6-NEXT: xori $8, $2, 31
-; MMR6-NEXT: sllv $4, $4, $8
+; MMR6-NEXT: sllv $4, $4, $7
; MMR6-NEXT: or $3, $3, $4
-; MMR6-NEXT: or $4, $1, $6
-; MMR6-NEXT: lw16 $5, 12($5)
-; MMR6-NEXT: srlv $1, $5, $2
-; MMR6-NEXT: sll16 $5, $7, 1
-; MMR6-NEXT: sllv $5, $5, $8
-; MMR6-NEXT: or $5, $1, $5
-; MMR6-NEXT: srlv $2, $16, $2
+; MMR6-NEXT: or $4, $1, $8
+; MMR6-NEXT: lw16 $2, 12($2)
+; MMR6-NEXT: srlv $1, $2, $6
+; MMR6-NEXT: sll16 $2, $5, 1
+; MMR6-NEXT: sllv $2, $2, $7
+; MMR6-NEXT: or $5, $1, $2
+; MMR6-NEXT: srlv $2, $16, $6
; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 81f089a529470..394890a9dcc7c 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -440,49 +440,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS2: # %bb.0: # %entry
; MIPS2-NEXT: addiu $sp, $sp, -32
; MIPS2-NEXT: .cfi_def_cfa_offset 32
-; MIPS2-NEXT: swl $zero, 28($sp)
-; MIPS2-NEXT: swl $zero, 24($sp)
-; MIPS2-NEXT: swl $zero, 20($sp)
-; MIPS2-NEXT: swl $zero, 16($sp)
-; MIPS2-NEXT: swl $7, 12($sp)
-; MIPS2-NEXT: swl $6, 8($sp)
-; MIPS2-NEXT: swl $5, 4($sp)
-; MIPS2-NEXT: swl $4, 0($sp)
-; MIPS2-NEXT: swr $zero, 31($sp)
-; MIPS2-NEXT: swr $zero, 27($sp)
-; MIPS2-NEXT: swr $zero, 23($sp)
-; MIPS2-NEXT: swr $zero, 19($sp)
-; MIPS2-NEXT: swr $7, 15($sp)
-; MIPS2-NEXT: swr $6, 11($sp)
-; MIPS2-NEXT: swr $5, 7($sp)
-; MIPS2-NEXT: swr $4, 3($sp)
; MIPS2-NEXT: lw $1, 60($sp)
; MIPS2-NEXT: srl $2, $1, 3
-; MIPS2-NEXT: andi $2, $2, 15
+; MIPS2-NEXT: sw $7, 12($sp)
+; MIPS2-NEXT: sw $6, 8($sp)
+; MIPS2-NEXT: sw $5, 4($sp)
+; MIPS2-NEXT: sw $4, 0($sp)
+; MIPS2-NEXT: andi $2, $2, 12
; MIPS2-NEXT: addiu $3, $sp, 0
; MIPS2-NEXT: addu $4, $3, $2
-; MIPS2-NEXT: lwl $5, 8($4)
-; MIPS2-NEXT: lwr $5, 11($4)
-; MIPS2-NEXT: srl $2, $5, 1
-; MIPS2-NEXT: lwl $3, 4($4)
-; MIPS2-NEXT: lwr $3, 7($4)
-; MIPS2-NEXT: andi $1, $1, 7
-; MIPS2-NEXT: not $6, $1
-; MIPS2-NEXT: sllv $7, $3, $1
-; MIPS2-NEXT: srlv $6, $2, $6
-; MIPS2-NEXT: lwl $2, 0($4)
-; MIPS2-NEXT: lwr $2, 3($4)
-; MIPS2-NEXT: sllv $2, $2, $1
-; MIPS2-NEXT: srl $3, $3, 1
-; MIPS2-NEXT: xori $8, $1, 31
-; MIPS2-NEXT: srlv $3, $3, $8
-; MIPS2-NEXT: or $2, $2, $3
-; MIPS2-NEXT: or $3, $7, $6
+; MIPS2-NEXT: sw $zero, 28($sp)
+; MIPS2-NEXT: sw $zero, 24($sp)
+; MIPS2-NEXT: sw $zero, 20($sp)
+; MIPS2-NEXT: sw $zero, 16($sp)
+; MIPS2-NEXT: lw $5, 8($4)
+; MIPS2-NEXT: lw $2, 4($4)
+; MIPS2-NEXT: sllv $3, $2, $1
+; MIPS2-NEXT: srl $6, $5, 1
+; MIPS2-NEXT: andi $7, $1, 31
+; MIPS2-NEXT: xori $7, $7, 31
+; MIPS2-NEXT: srlv $6, $6, $7
+; MIPS2-NEXT: lw $8, 0($4)
+; MIPS2-NEXT: sllv $8, $8, $1
+; MIPS2-NEXT: srl $2, $2, 1
+; MIPS2-NEXT: srlv $2, $2, $7
+; MIPS2-NEXT: or $2, $8, $2
+; MIPS2-NEXT: or $3, $3, $6
; MIPS2-NEXT: sllv $5, $5, $1
-; MIPS2-NEXT: lwl $6, 12($4)
-; MIPS2-NEXT: lwr $6, 15($4)
+; MIPS2-NEXT: lw $6, 12($4)
; MIPS2-NEXT: srl $4, $6, 1
-; MIPS2-NEXT: srlv $4, $4, $8
+; MIPS2-NEXT: srlv $4, $4, $7
; MIPS2-NEXT: or $4, $5, $4
; MIPS2-NEXT: sllv $5, $6, $1
; MIPS2-NEXT: jr $ra
@@ -492,49 +479,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $zero, 28($sp)
-; MIPS32-NEXT: swl $zero, 24($sp)
-; MIPS32-NEXT: swl $zero, 20($sp)
-; MIPS32-NEXT: swl $zero, 16($sp)
-; MIPS32-NEXT: swl $7, 12($sp)
-; MIPS32-NEXT: swl $6, 8($sp)
-; MIPS32-NEXT: swl $5, 4($sp)
-; MIPS32-NEXT: swl $4, 0($sp)
-; MIPS32-NEXT: swr $zero, 31($sp)
-; MIPS32-NEXT: swr $zero, 27($sp)
-; MIPS32-NEXT: swr $zero, 23($sp)
-; MIPS32-NEXT: swr $zero, 19($sp)
-; MIPS32-NEXT: swr $7, 15($sp)
-; MIPS32-NEXT: swr $6, 11($sp)
-; MIPS32-NEXT: swr $5, 7($sp)
-; MIPS32-NEXT: swr $4, 3($sp)
; MIPS32-NEXT: lw $1, 60($sp)
; MIPS32-NEXT: srl $2, $1, 3
-; MIPS32-NEXT: andi $2, $2, 15
+; MIPS32-NEXT: sw $7, 12($sp)
+; MIPS32-NEXT: sw $6, 8($sp)
+; MIPS32-NEXT: sw $5, 4($sp)
+; MIPS32-NEXT: sw $4, 0($sp)
+; MIPS32-NEXT: andi $2, $2, 12
; MIPS32-NEXT: addiu $3, $sp, 0
; MIPS32-NEXT: addu $4, $3, $2
-; MIPS32-NEXT: lwl $5, 8($4)
-; MIPS32-NEXT: lwr $5, 11($4)
-; MIPS32-NEXT: srl $2, $5, 1
-; MIPS32-NEXT: lwl $3, 4($4)
-; MIPS32-NEXT: lwr $3, 7($4)
-; MIPS32-NEXT: andi $1, $1, 7
-; MIPS32-NEXT: not $6, $1
-; MIPS32-NEXT: sllv $7, $3, $1
-; MIPS32-NEXT: srlv $6, $2, $6
-; MIPS32-NEXT: lwl $2, 0($4)
-; MIPS32-NEXT: lwr $2, 3($4)
-; MIPS32-NEXT: sllv $2, $2, $1
-; MIPS32-NEXT: srl $3, $3, 1
-; MIPS32-NEXT: xori $8, $1, 31
-; MIPS32-NEXT: srlv $3, $3, $8
-; MIPS32-NEXT: or $2, $2, $3
-; MIPS32-NEXT: or $3, $7, $6
+; MIPS32-NEXT: sw $zero, 28($sp)
+; MIPS32-NEXT: sw $zero, 24($sp)
+; MIPS32-NEXT: sw $zero, 20($sp)
+; MIPS32-NEXT: sw $zero, 16($sp)
+; MIPS32-NEXT: lw $5, 8($4)
+; MIPS32-NEXT: lw $2, 4($4)
+; MIPS32-NEXT: sllv $3, $2, $1
+; MIPS32-NEXT: srl $6, $5, 1
+; MIPS32-NEXT: andi $7, $1, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: srlv $6, $6, $7
+; MIPS32-NEXT: lw $8, 0($4)
+; MIPS32-NEXT: sllv $8, $8, $1
+; MIPS32-NEXT: srl $2, $2, 1
+; MIPS32-NEXT: srlv $2, $2, $7
+; MIPS32-NEXT: or $2, $8, $2
+; MIPS32-NEXT: or $3, $3, $6
; MIPS32-NEXT: sllv $5, $5, $1
-; MIPS32-NEXT: lwl $6, 12($4)
-; MIPS32-NEXT: lwr $6, 15($4)
+; MIPS32-NEXT: lw $6, 12($4)
; MIPS32-NEXT: srl $4, $6, 1
-; MIPS32-NEXT: srlv $4, $4, $8
+; MIPS32-NEXT: srlv $4, $4, $7
; MIPS32-NEXT: or $4, $5, $4
; MIPS32-NEXT: sllv $5, $6, $1
; MIPS32-NEXT: jr $ra
@@ -544,48 +518,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2: # %bb.0: # %entry
; MIPS32R2-NEXT: addiu $sp, $sp, -32
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT: swl $zero, 28($sp)
-; MIPS32R2-NEXT: swl $zero, 24($sp)
-; MIPS32R2-NEXT: swl $zero, 20($sp)
-; MIPS32R2-NEXT: swl $zero, 16($sp)
-; MIPS32R2-NEXT: swl $7, 12($sp)
-; MIPS32R2-NEXT: swl $6, 8($sp)
-; MIPS32R2-NEXT: swl $5, 4($sp)
-; MIPS32R2-NEXT: swl $4, 0($sp)
-; MIPS32R2-NEXT: swr $zero, 31($sp)
-; MIPS32R2-NEXT: swr $zero, 27($sp)
-; MIPS32R2-NEXT: swr $zero, 23($sp)
-; MIPS32R2-NEXT: swr $zero, 19($sp)
-; MIPS32R2-NEXT: swr $7, 15($sp)
-; MIPS32R2-NEXT: swr $6, 11($sp)
-; MIPS32R2-NEXT: swr $5, 7($sp)
-; MIPS32R2-NEXT: swr $4, 3($sp)
; MIPS32R2-NEXT: lw $1, 60($sp)
-; MIPS32R2-NEXT: ext $2, $1, 3, 4
+; MIPS32R2-NEXT: srl $2, $1, 3
+; MIPS32R2-NEXT: sw $7, 12($sp)
+; MIPS32R2-NEXT: sw $6, 8($sp)
+; MIPS32R2-NEXT: sw $5, 4($sp)
+; MIPS32R2-NEXT: sw $4, 0($sp)
+; MIPS32R2-NEXT: andi $2, $2, 12
; MIPS32R2-NEXT: addiu $3, $sp, 0
; MIPS32R2-NEXT: addu $4, $3, $2
-; MIPS32R2-NEXT: lwl $5, 8($4)
-; MIPS32R2-NEXT: lwr $5, 11($4)
-; MIPS32R2-NEXT: srl $2, $5, 1
-; MIPS32R2-NEXT: lwl $3, 4($4)
-; MIPS32R2-NEXT: lwr $3, 7($4)
-; MIPS32R2-NEXT: andi $1, $1, 7
-; MIPS32R2-NEXT: not $6, $1
-; MIPS32R2-NEXT: sllv $7, $3, $1
-; MIPS32R2-NEXT: srlv $6, $2, $6
-; MIPS32R2-NEXT: lwl $2, 0($4)
-; MIPS32R2-NEXT: lwr $2, 3($4)
-; MIPS32R2-NEXT: sllv $2, $2, $1
-; MIPS32R2-NEXT: srl $3, $3, 1
-; MIPS32R2-NEXT: xori $8, $1, 31
-; MIPS32R2-NEXT: srlv $3, $3, $8
-; MIPS32R2-NEXT: or $2, $2, $3
-; MIPS32R2-NEXT: or $3, $7, $6
+; MIPS32R2-NEXT: sw $zero, 28($sp)
+; MIPS32R2-NEXT: sw $zero, 24($sp)
+; MIPS32R2-NEXT: sw $zero, 20($sp)
+; MIPS32R2-NEXT: sw $zero, 16($sp)
+; MIPS32R2-NEXT: lw $5, 8($4)
+; MIPS32R2-NEXT: lw $2, 4($4)
+; MIPS32R2-NEXT: sllv $3, $2, $1
+; MIPS32R2-NEXT: srl $6, $5, 1
+; MIPS32R2-NEXT: andi $7, $1, 31
+; MIPS32R2-NEXT: xori $7, $7, 31
+; MIPS32R2-NEXT: srlv $6, $6, $7
+; MIPS32R2-NEXT: lw $8, 0($4)
+; MIPS32R2-NEXT: sllv $8, $8, $1
+; MIPS32R2-NEXT: srl $2, $2, 1
+; MIPS32R2-NEXT: srlv $2, $2, $7
+; MIPS32R2-NEXT: or $2, $8, $2
+; MIPS32R2-NEXT: or $3, $3, $6
; MIPS32R2-NEXT: sllv $5, $5, $1
-; MIPS32R2-NEXT: lwl $6, 12($4)
-; MIPS32R2-NEXT: lwr $6, 15($4)
+; MIPS32R2-NEXT: lw $6, 12($4)
; MIPS32R2-NEXT: srl $4, $6, 1
-; MIPS32R2-NEXT: srlv $4, $4, $8
+; MIPS32R2-NEXT: srlv $4, $4, $7
; MIPS32R2-NEXT: or $4, $5, $4
; MIPS32R2-NEXT: sllv $5, $6, $1
; MIPS32R2-NEXT: jr $ra
@@ -596,11 +558,12 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: addiu $sp, $sp, -32
; MIPS32R6-NEXT: .cfi_def_cfa_offset 32
; MIPS32R6-NEXT: lw $1, 60($sp)
+; MIPS32R6-NEXT: srl $2, $1, 3
; MIPS32R6-NEXT: sw $7, 12($sp)
; MIPS32R6-NEXT: sw $6, 8($sp)
; MIPS32R6-NEXT: sw $5, 4($sp)
; MIPS32R6-NEXT: sw $4, 0($sp)
-; MIPS32R6-NEXT: ext $2, $1, 3, 4
+; MIPS32R6-NEXT: andi $2, $2, 12
; MIPS32R6-NEXT: addiu $3, $sp, 0
; MIPS32R6-NEXT: addu $4, $3, $2
; MIPS32R6-NEXT: sw $zero, 28($sp)
@@ -608,23 +571,22 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: sw $zero, 20($sp)
; MIPS32R6-NEXT: sw $zero, 16($sp)
; MIPS32R6-NEXT: lw $5, 8($4)
-; MIPS32R6-NEXT: srl $2, $5, 1
-; MIPS32R6-NEXT: lw $3, 4($4)
-; MIPS32R6-NEXT: andi $1, $1, 7
-; MIPS32R6-NEXT: not $6, $1
-; MIPS32R6-NEXT: sllv $7, $3, $1
-; MIPS32R6-NEXT: srlv $6, $2, $6
-; MIPS32R6-NEXT: lw $2, 0($4)
-; MIPS32R6-NEXT: sllv $2, $2, $1
-; MIPS32R6-NEXT: srl $3, $3, 1
-; MIPS32R6-NEXT: xori $8, $1, 31
-; MIPS32R6-NEXT: srlv $3, $3, $8
-; MIPS32R6-NEXT: or $2, $2, $3
-; MIPS32R6-NEXT: or $3, $7, $6
+; MIPS32R6-NEXT: lw $2, 4($4)
+; MIPS32R6-NEXT: sllv $3, $2, $1
+; MIPS32R6-NEXT: srl $6, $5, 1
+; MIPS32R6-NEXT: andi $7, $1, 31
+; MIPS32R6-NEXT: xori $7, $7, 31
+; MIPS32R6-NEXT: srlv $6, $6, $7
+; MIPS32R6-NEXT: lw $8, 0($4)
+; MIPS32R6-NEXT: sllv $8, $8, $1
+; MIPS32R6-NEXT: srl $2, $2, 1
+; MIPS32R6-NEXT: srlv $2, $2, $7
+; MIPS32R6-NEXT: or $2, $8, $2
+; MIPS32R6-NEXT: or $3, $3, $6
; MIPS32R6-NEXT: sllv $5, $5, $1
; MIPS32R6-NEXT: lw $6, 12($4)
; MIPS32R6-NEXT: srl $4, $6, 1
-; MIPS32R6-NEXT: srlv $4, $4, $8
+; MIPS32R6-NEXT: srlv $4, $4, $7
; MIPS32R6-NEXT: or $4, $5, $4
; MIPS32R6-NEXT: sllv $5, $6, $1
; MIPS32R6-NEXT: jr $ra
@@ -722,47 +684,32 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: swl $2, 28($sp)
-; MMR3-NEXT: swl $2, 24($sp)
-; MMR3-NEXT: swl $2, 20($sp)
-; MMR3-NEXT: swl $2, 16($sp)
-; MMR3-NEXT: swl $7, 12($sp)
-; MMR3-NEXT: swl $6, 8($sp)
-; MMR3-NEXT: swl $5, 4($sp)
-; MMR3-NEXT: swl $4, 0($sp)
-; MMR3-NEXT: swr $2, 31($sp)
-; MMR3-NEXT: swr $2, 27($sp)
-; MMR3-NEXT: swr $2, 23($sp)
-; MMR3-NEXT: swr $2, 19($sp)
-; MMR3-NEXT: swr $7, 15($sp)
-; MMR3-NEXT: swr $6, 11($sp)
-; MMR3-NEXT: swr $5, 7($sp)
-; MMR3-NEXT: swr $4, 3($sp)
+; MMR3-NEXT: sw $2, 28($sp)
+; MMR3-NEXT: sw $2, 24($sp)
+; MMR3-NEXT: sw $2, 20($sp)
+; MMR3-NEXT: sw $2, 16($sp)
+; MMR3-NEXT: swp $6, 8($sp)
+; MMR3-NEXT: swp $4, 0($sp)
; MMR3-NEXT: lw $2, 68($sp)
-; MMR3-NEXT: ext $3, $2, 3, 4
+; MMR3-NEXT: srl16 $3, $2, 3
+; MMR3-NEXT: andi $3, $3, 12
; MMR3-NEXT: addiur1sp $4, 0
; MMR3-NEXT: addu16 $4, $4, $3
-; MMR3-NEXT: lwl $6, 8($4)
-; MMR3-NEXT: lwr $6, 11($4)
-; MMR3-NEXT: srl16 $3, $6, 1
-; MMR3-NEXT: lwl $7, 4($4)
-; MMR3-NEXT: lwr $7, 7($4)
-; MMR3-NEXT: andi16 $5, $2, 7
-; MMR3-NEXT: not16 $2, $5
-; MMR3-NEXT: andi16 $2, $2, 31
+; MMR3-NEXT: lw16 $6, 8($4)
+; MMR3-NEXT: lw16 $7, 4($4)
+; MMR3-NEXT: andi16 $5, $2, 31
; MMR3-NEXT: sllv $16, $7, $5
-; MMR3-NEXT: srlv $3, $3, $2
-; MMR3-NEXT: lwl $1, 0($4)
-; MMR3-NEXT: lwr $1, 3($4)
-; MMR3-NEXT: sllv $17, $1, $5
-; MMR3-NEXT: srl16 $2, $7, 1
+; MMR3-NEXT: srl16 $2, $6, 1
; MMR3-NEXT: xori $1, $5, 31
+; MMR3-NEXT: srlv $3, $2, $1
+; MMR3-NEXT: lw16 $2, 0($4)
+; MMR3-NEXT: sllv $17, $2, $5
+; MMR3-NEXT: srl16 $2, $7, 1
; MMR3-NEXT: srlv $2, $2, $1
; MMR3-NEXT: or16 $2, $17
; MMR3-NEXT: or16 $3, $16
; MMR3-NEXT: sllv $6, $6, $5
-; MMR3-NEXT: lwl $7, 12($4)
-; MMR3-NEXT: lwr $7, 15($4)
+; MMR3-NEXT: lw16 $7, 12($4)
; MMR3-NEXT: srl16 $4, $7, 1
; MMR3-NEXT: srlv $4, $4, $1
; MMR3-NEXT: or16 $4, $6
@@ -785,30 +732,29 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $5, 4($sp)
; MMR6-NEXT: sw $4, 0($sp)
; MMR6-NEXT: lw $2, 60($sp)
-; MMR6-NEXT: ext $3, $2, 3, 4
+; MMR6-NEXT: srl16 $3, $2, 3
+; MMR6-NEXT: andi $3, $3, 12
; MMR6-NEXT: addiu $4, $sp, 0
; MMR6-NEXT: addu16 $4, $4, $3
-; MMR6-NEXT: lw16 $6, 8($4)
-; MMR6-NEXT: srl16 $3, $6, 1
-; MMR6-NEXT: lw16 $7, 4($4)
-; MMR6-NEXT: andi16 $5, $2, 7
-; MMR6-NEXT: not16 $2, $5
-; MMR6-NEXT: andi16 $2, $2, 31
-; MMR6-NEXT: sllv $1, $7, $5
-; MMR6-NEXT: srlv $3, $3, $2
+; MMR6-NEXT: lw16 $5, 8($4)
+; MMR6-NEXT: lw16 $3, 4($4)
+; MMR6-NEXT: andi16 $6, $2, 31
+; MMR6-NEXT: sllv $1, $3, $6
+; MMR6-NEXT: srl16 $2, $5, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: srlv $8, $2, $7
; MMR6-NEXT: lw16 $2, 0($4)
-; MMR6-NEXT: sllv $2, $2, $5
-; MMR6-NEXT: srl16 $7, $7, 1
-; MMR6-NEXT: xori $8, $5, 31
-; MMR6-NEXT: srlv $7, $7, $8
-; MMR6-NEXT: or $2, $2, $7
-; MMR6-NEXT: or $3, $1, $3
-; MMR6-NEXT: sllv $1, $6, $5
-; MMR6-NEXT: lw16 $6, 12($4)
-; MMR6-NEXT: srl16 $4, $6, 1
-; MMR6-NEXT: srlv $4, $4, $8
+; MMR6-NEXT: sllv $2, $2, $6
+; MMR6-NEXT: srl16 $3, $3, 1
+; MMR6-NEXT: srlv $3, $3, $7
+; MMR6-NEXT: or $2, $2, $3
+; MMR6-NEXT: or $3, $1, $8
+; MMR6-NEXT: sllv $1, $5, $6
+; MMR6-NEXT: lw16 $5, 12($4)
+; MMR6-NEXT: srl16 $4, $5, 1
+; MMR6-NEXT: srlv $4, $4, $7
; MMR6-NEXT: or $4, $1, $4
-; MMR6-NEXT: sllv $5, $6, $5
+; MMR6-NEXT: sllv $5, $5, $6
; MMR6-NEXT: addiu $sp, $sp, 32
; MMR6-NEXT: jrc $ra
entry:
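
The MIPS diffs above all reduce to one shape: the misaligned swl/swr stores and lwl/lwr loads disappear in favour of plain sw/lw, the byte offset computed from the shift amount is rounded down to a word boundary (andi with 12, where the old code used andi with 15 or ext ..., 3, 4), and the residual in-word shift keeps all five bits (andi with 31 instead of 7); the R6 and MMR6 blocks, which already used plain lw/sw, change only the offset math. As a minimal C++ sketch (not code from the patch; shiftOutWord is a hypothetical name), here is how one output word of the wide left shift is recombined from two aligned words, assuming BitShift is the 0-31 in-word part:

#include <cstdint>

// Illustrative only: mirrors the sllv / srl-by-1 / xori-31 / srlv / or
// sequence in the MIPS checks above.
uint32_t shiftOutWord(uint32_t Hi, uint32_t Lo, unsigned BitShift) {
  // Want (Hi << B) | (Lo >> (32 - B)), but a C++ shift by 32 is undefined
  // at B == 0. Pre-shifting Lo right by one and using B ^ 31 (== 31 - B
  // for B in 0..31) keeps every shift in range, exactly like the
  // generated `srl $6, $5, 1` / `xori $7, $7, 31` / `srlv` code.
  return (Hi << BitShift) | ((Lo >> 1) >> (BitShift ^ 31));
}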
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index c48361e0a8035..ae25feeb8893c 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -8,58 +8,52 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 8, 2048
; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: li 7, 2048
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 7
-; CHECK-NEXT: mtctr 8
-; CHECK-NEXT: addi 8, 1, 16
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: mtctr 7
+; CHECK-NEXT: addi 7, 1, 20
; CHECK-NEXT: .LBB0_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 9, 0(4)
-; CHECK-NEXT: lwz 10, 4(4)
-; CHECK-NEXT: lwz 11, 8(4)
-; CHECK-NEXT: lwz 12, 12(4)
-; CHECK-NEXT: lwz 0, 12(5)
+; CHECK-NEXT: lwz 8, 0(4)
+; CHECK-NEXT: lwz 9, 4(4)
+; CHECK-NEXT: lwz 10, 8(4)
+; CHECK-NEXT: lwz 11, 12(4)
+; CHECK-NEXT: lwz 12, 12(5)
+; CHECK-NEXT: stw 6, 48(1)
; CHECK-NEXT: stw 6, 44(1)
; CHECK-NEXT: stw 6, 40(1)
; CHECK-NEXT: stw 6, 36(1)
-; CHECK-NEXT: stw 6, 32(1)
-; CHECK-NEXT: stw 12, 28(1)
-; CHECK-NEXT: clrlwi 12, 0, 29
-; CHECK-NEXT: stw 11, 24(1)
-; CHECK-NEXT: nand 11, 0, 7
-; CHECK-NEXT: stw 10, 20(1)
-; CHECK-NEXT: subfic 29, 12, 32
-; CHECK-NEXT: stw 9, 16(1)
-; CHECK-NEXT: rlwinm 9, 0, 29, 28, 31
-; CHECK-NEXT: lwzux 10, 9, 8
-; CHECK-NEXT: clrlwi 11, 11, 27
-; CHECK-NEXT: lwz 0, 8(9)
-; CHECK-NEXT: slw 10, 10, 12
-; CHECK-NEXT: lwz 30, 4(9)
-; CHECK-NEXT: lwz 9, 12(9)
-; CHECK-NEXT: slw 28, 30, 12
-; CHECK-NEXT: srw 30, 30, 29
-; CHECK-NEXT: srw 29, 9, 29
-; CHECK-NEXT: slw 9, 9, 12
-; CHECK-NEXT: slw 12, 0, 12
-; CHECK-NEXT: srwi 0, 0, 1
-; CHECK-NEXT: stw 9, 12(3)
-; CHECK-NEXT: or 9, 12, 29
-; CHECK-NEXT: srw 11, 0, 11
-; CHECK-NEXT: stw 9, 8(3)
-; CHECK-NEXT: or 9, 10, 30
-; CHECK-NEXT: stw 9, 0(3)
-; CHECK-NEXT: or 9, 28, 11
-; CHECK-NEXT: stw 9, 4(3)
+; CHECK-NEXT: stw 11, 32(1)
+; CHECK-NEXT: stw 10, 28(1)
+; CHECK-NEXT: clrlwi 10, 12, 27
+; CHECK-NEXT: stw 9, 24(1)
+; CHECK-NEXT: stw 8, 20(1)
+; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
+; CHECK-NEXT: lwzux 9, 8, 7
+; CHECK-NEXT: subfic 12, 10, 32
+; CHECK-NEXT: lwz 11, 8(8)
+; CHECK-NEXT: slw 9, 9, 10
+; CHECK-NEXT: lwz 0, 4(8)
+; CHECK-NEXT: lwz 8, 12(8)
+; CHECK-NEXT: srw 30, 11, 12
+; CHECK-NEXT: slw 29, 0, 10
+; CHECK-NEXT: srw 0, 0, 12
+; CHECK-NEXT: srw 12, 8, 12
+; CHECK-NEXT: slw 11, 11, 10
+; CHECK-NEXT: slw 8, 8, 10
+; CHECK-NEXT: stw 8, 12(3)
+; CHECK-NEXT: or 8, 11, 12
+; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: or 8, 9, 0
+; CHECK-NEXT: stw 8, 0(3)
+; CHECK-NEXT: or 8, 29, 30
+; CHECK-NEXT: stw 8, 4(3)
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 28, 48(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 1, 1, 64
; CHECK-NEXT: blr
entry:
@@ -83,59 +77,53 @@ for.end: ; preds = %for.body
define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 2048
-; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 7
-; CHECK-NEXT: mtctr 7
-; CHECK-NEXT: addi 7, 1, 36
+; CHECK-NEXT: stwu 1, -48(1)
+; CHECK-NEXT: stw 30, 40(1) # 4-byte Folded Spill
+; CHECK-NEXT: li 6, 2048
+; CHECK-NEXT: mtctr 6
+; CHECK-NEXT: addi 6, 1, 24
; CHECK-NEXT: .LBB1_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 8, 0(4)
-; CHECK-NEXT: lwz 10, 8(4)
-; CHECK-NEXT: lwz 12, 12(5)
-; CHECK-NEXT: lwz 9, 4(4)
-; CHECK-NEXT: lwz 11, 12(4)
-; CHECK-NEXT: stw 10, 44(1)
-; CHECK-NEXT: rlwinm 10, 12, 29, 28, 31
-; CHECK-NEXT: stw 8, 36(1)
-; CHECK-NEXT: srawi 8, 8, 31
-; CHECK-NEXT: stw 11, 48(1)
-; CHECK-NEXT: clrlwi 11, 12, 29
-; CHECK-NEXT: stw 9, 40(1)
-; CHECK-NEXT: nand 9, 12, 6
-; CHECK-NEXT: stw 8, 32(1)
-; CHECK-NEXT: subfic 30, 11, 32
+; CHECK-NEXT: lwz 7, 0(4)
+; CHECK-NEXT: lwz 8, 4(4)
+; CHECK-NEXT: lwz 11, 12(5)
+; CHECK-NEXT: lwz 9, 8(4)
+; CHECK-NEXT: lwz 10, 12(4)
; CHECK-NEXT: stw 8, 28(1)
-; CHECK-NEXT: clrlwi 9, 9, 27
-; CHECK-NEXT: stw 8, 24(1)
-; CHECK-NEXT: stw 8, 20(1)
-; CHECK-NEXT: sub 8, 7, 10
-; CHECK-NEXT: lwz 10, 4(8)
-; CHECK-NEXT: lwz 12, 8(8)
-; CHECK-NEXT: lwz 0, 0(8)
-; CHECK-NEXT: lwz 8, 12(8)
-; CHECK-NEXT: srw 29, 12, 11
-; CHECK-NEXT: slw 12, 12, 30
-; CHECK-NEXT: slw 30, 0, 30
-; CHECK-NEXT: srw 8, 8, 11
-; CHECK-NEXT: sraw 0, 0, 11
-; CHECK-NEXT: srw 11, 10, 11
-; CHECK-NEXT: slwi 10, 10, 1
-; CHECK-NEXT: or 8, 12, 8
-; CHECK-NEXT: slw 9, 10, 9
-; CHECK-NEXT: stw 8, 12(3)
-; CHECK-NEXT: or 8, 30, 11
-; CHECK-NEXT: stw 8, 4(3)
-; CHECK-NEXT: or 8, 29, 9
-; CHECK-NEXT: stw 0, 0(3)
-; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: rlwinm 8, 11, 29, 28, 29
+; CHECK-NEXT: stw 7, 24(1)
+; CHECK-NEXT: srawi 7, 7, 31
+; CHECK-NEXT: stw 10, 36(1)
+; CHECK-NEXT: clrlwi 10, 11, 27
+; CHECK-NEXT: stw 9, 32(1)
+; CHECK-NEXT: subfic 12, 10, 32
+; CHECK-NEXT: stw 7, 20(1)
+; CHECK-NEXT: stw 7, 16(1)
+; CHECK-NEXT: stw 7, 12(1)
+; CHECK-NEXT: stw 7, 8(1)
+; CHECK-NEXT: sub 7, 6, 8
+; CHECK-NEXT: lwz 8, 4(7)
+; CHECK-NEXT: lwz 9, 0(7)
+; CHECK-NEXT: lwz 11, 12(7)
+; CHECK-NEXT: srw 0, 8, 10
+; CHECK-NEXT: lwz 7, 8(7)
+; CHECK-NEXT: slw 30, 9, 12
+; CHECK-NEXT: slw 8, 8, 12
+; CHECK-NEXT: srw 11, 11, 10
+; CHECK-NEXT: slw 12, 7, 12
+; CHECK-NEXT: srw 7, 7, 10
+; CHECK-NEXT: or 7, 8, 7
+; CHECK-NEXT: stw 7, 8(3)
+; CHECK-NEXT: or 7, 12, 11
+; CHECK-NEXT: sraw 9, 9, 10
+; CHECK-NEXT: stw 7, 12(3)
+; CHECK-NEXT: or 7, 30, 0
+; CHECK-NEXT: stw 9, 0(3)
+; CHECK-NEXT: stw 7, 4(3)
; CHECK-NEXT: bdnz .LBB1_1
; CHECK-NEXT: # %bb.2: # %for.end
-; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: addi 1, 1, 64
+; CHECK-NEXT: lwz 30, 40(1) # 4-byte Folded Reload
+; CHECK-NEXT: addi 1, 1, 48
; CHECK-NEXT: blr
entry:
br label %for.body
@@ -159,59 +147,53 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 8, 2048
; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: li 7, 2048
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 7
-; CHECK-NEXT: mtctr 8
-; CHECK-NEXT: addi 8, 1, 32
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: mtctr 7
+; CHECK-NEXT: addi 7, 1, 36
; CHECK-NEXT: .LBB2_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 10, 4(4)
-; CHECK-NEXT: lwz 0, 12(5)
-; CHECK-NEXT: lwz 9, 0(4)
-; CHECK-NEXT: lwz 11, 8(4)
-; CHECK-NEXT: lwz 12, 12(4)
-; CHECK-NEXT: stw 10, 36(1)
-; CHECK-NEXT: rlwinm 10, 0, 29, 28, 31
+; CHECK-NEXT: lwz 8, 0(4)
+; CHECK-NEXT: lwz 12, 12(5)
+; CHECK-NEXT: lwz 9, 4(4)
+; CHECK-NEXT: lwz 10, 8(4)
+; CHECK-NEXT: lwz 11, 12(4)
+; CHECK-NEXT: stw 8, 36(1)
+; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
+; CHECK-NEXT: stw 6, 32(1)
+; CHECK-NEXT: sub 8, 7, 8
; CHECK-NEXT: stw 6, 28(1)
-; CHECK-NEXT: sub 10, 8, 10
; CHECK-NEXT: stw 6, 24(1)
; CHECK-NEXT: stw 6, 20(1)
-; CHECK-NEXT: stw 6, 16(1)
-; CHECK-NEXT: stw 12, 44(1)
-; CHECK-NEXT: clrlwi 12, 0, 29
-; CHECK-NEXT: stw 11, 40(1)
-; CHECK-NEXT: subfic 29, 12, 32
-; CHECK-NEXT: stw 9, 32(1)
-; CHECK-NEXT: nand 9, 0, 7
-; CHECK-NEXT: lwz 11, 4(10)
-; CHECK-NEXT: clrlwi 9, 9, 27
-; CHECK-NEXT: lwz 0, 8(10)
-; CHECK-NEXT: lwz 30, 0(10)
-; CHECK-NEXT: lwz 10, 12(10)
-; CHECK-NEXT: srw 28, 0, 12
-; CHECK-NEXT: slw 0, 0, 29
-; CHECK-NEXT: slw 29, 30, 29
-; CHECK-NEXT: srw 10, 10, 12
-; CHECK-NEXT: srw 30, 30, 12
-; CHECK-NEXT: srw 12, 11, 12
-; CHECK-NEXT: slwi 11, 11, 1
-; CHECK-NEXT: slw 9, 11, 9
-; CHECK-NEXT: or 10, 0, 10
-; CHECK-NEXT: stw 10, 12(3)
-; CHECK-NEXT: or 10, 29, 12
-; CHECK-NEXT: or 9, 28, 9
-; CHECK-NEXT: stw 30, 0(3)
-; CHECK-NEXT: stw 10, 4(3)
-; CHECK-NEXT: stw 9, 8(3)
+; CHECK-NEXT: stw 11, 48(1)
+; CHECK-NEXT: clrlwi 11, 12, 27
+; CHECK-NEXT: stw 10, 44(1)
+; CHECK-NEXT: subfic 0, 11, 32
+; CHECK-NEXT: stw 9, 40(1)
+; CHECK-NEXT: lwz 9, 4(8)
+; CHECK-NEXT: lwz 10, 0(8)
+; CHECK-NEXT: lwz 12, 12(8)
+; CHECK-NEXT: srw 30, 9, 11
+; CHECK-NEXT: lwz 8, 8(8)
+; CHECK-NEXT: slw 29, 10, 0
+; CHECK-NEXT: slw 9, 9, 0
+; CHECK-NEXT: srw 12, 12, 11
+; CHECK-NEXT: slw 0, 8, 0
+; CHECK-NEXT: srw 8, 8, 11
+; CHECK-NEXT: or 8, 9, 8
+; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: or 8, 0, 12
+; CHECK-NEXT: srw 10, 10, 11
+; CHECK-NEXT: stw 8, 12(3)
+; CHECK-NEXT: or 8, 29, 30
+; CHECK-NEXT: stw 10, 0(3)
+; CHECK-NEXT: stw 8, 4(3)
; CHECK-NEXT: bdnz .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 28, 48(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 1, 1, 64
; CHECK-NEXT: blr
entry:
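
The PowerPC checks in ctrloop-sh.ll encode the same split of the shift amount, just in PPC idiom: rlwinm ...,29,28,29 extracts the byte offset rounded down to a word ((amt >> 3) & 12) and clrlwi ...,27 keeps the five-bit in-word shift (amt & 31). A small self-contained sketch of that decomposition follows; SplitShift and splitShiftAmount are hypothetical names for illustration, and it assumes the shift amount is already below the value's bit width (an IR shift by >= bitwidth is poison):

#include <cassert>

struct SplitShift {
  unsigned ByteOffset; // multiple of the load/store unit size
  unsigned BitShift;   // remaining shift, 0 .. 8*UnitBytes - 1
};

SplitShift splitShiftAmount(unsigned ShAmtBits, unsigned UnitBytes) {
  assert((UnitBytes & (UnitBytes - 1)) == 0 && "unit must be a power of two");
  // Byte offset into the stack slot, rounded down to the unit:
  // srl $2, $1, 3 ; andi $2, $2, 12 on MIPS, rlwinm ...,29,28,29 on PPC32.
  unsigned ByteOffset = (ShAmtBits / 8) & ~(UnitBytes - 1);
  // Shift within one unit: andi ...,31 on MIPS, clrlwi ...,27 on PPC32.
  unsigned BitShift = ShAmtBits % (UnitBytes * 8);
  return {ByteOffset, BitShift};
}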
diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
index 3e328c6ad9f0b..cc90300aafcea 100644
--- a/llvm/test/CodeGen/PowerPC/pr59074.ll
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -33,36 +33,34 @@ define void @pr59074(ptr %0) {
; LE32-NEXT: li 8, 12
; LE32-NEXT: xxswapd 0, 0
; LE32-NEXT: addi 4, 4, -12
-; LE32-NEXT: rlwinm 9, 4, 29, 28, 31
+; LE32-NEXT: rlwinm 9, 4, 29, 28, 29
; LE32-NEXT: stxvd2x 0, 6, 5
; LE32-NEXT: stw 7, 44(1)
; LE32-NEXT: stw 7, 40(1)
; LE32-NEXT: stw 7, 36(1)
; LE32-NEXT: stw 8, 16(1)
+; LE32-NEXT: clrlwi 4, 4, 27
; LE32-NEXT: lwzux 5, 9, 6
-; LE32-NEXT: li 6, 7
-; LE32-NEXT: lwz 7, 8(9)
-; LE32-NEXT: nand 6, 4, 6
-; LE32-NEXT: lwz 8, 4(9)
-; LE32-NEXT: clrlwi 4, 4, 29
-; LE32-NEXT: lwz 9, 12(9)
-; LE32-NEXT: clrlwi 6, 6, 27
; LE32-NEXT: subfic 11, 4, 32
+; LE32-NEXT: lwz 6, 8(9)
+; LE32-NEXT: lwz 7, 4(9)
+; LE32-NEXT: lwz 8, 12(9)
+; LE32-NEXT: xori 9, 4, 31
; LE32-NEXT: srw 5, 5, 4
-; LE32-NEXT: slwi 10, 7, 1
-; LE32-NEXT: srw 7, 7, 4
-; LE32-NEXT: slw 6, 10, 6
-; LE32-NEXT: srw 10, 8, 4
-; LE32-NEXT: slw 8, 8, 11
-; LE32-NEXT: slw 11, 9, 11
-; LE32-NEXT: srw 4, 9, 4
-; LE32-NEXT: or 5, 8, 5
-; LE32-NEXT: or 7, 11, 7
-; LE32-NEXT: or 6, 10, 6
+; LE32-NEXT: slwi 10, 6, 1
+; LE32-NEXT: srw 6, 6, 4
+; LE32-NEXT: slw 9, 10, 9
+; LE32-NEXT: srw 10, 7, 4
+; LE32-NEXT: slw 7, 7, 11
+; LE32-NEXT: slw 11, 8, 11
+; LE32-NEXT: srw 4, 8, 4
+; LE32-NEXT: or 5, 7, 5
+; LE32-NEXT: or 6, 11, 6
+; LE32-NEXT: or 7, 10, 9
; LE32-NEXT: stw 4, 12(3)
-; LE32-NEXT: stw 7, 8(3)
+; LE32-NEXT: stw 6, 8(3)
; LE32-NEXT: stw 5, 0(3)
-; LE32-NEXT: stw 6, 4(3)
+; LE32-NEXT: stw 7, 4(3)
; LE32-NEXT: addi 1, 1, 80
; LE32-NEXT: blr
;
@@ -89,37 +87,33 @@ define void @pr59074(ptr %0) {
; BE32-NEXT: li 6, 12
; BE32-NEXT: li 7, 0
; BE32-NEXT: addi 8, 1, -48
-; BE32-NEXT: li 10, 7
; BE32-NEXT: stxvw4x 0, 0, 5
-; BE32-NEXT: addi 4, 4, -12
; BE32-NEXT: stw 6, -36(1)
+; BE32-NEXT: addi 4, 4, -12
; BE32-NEXT: stw 7, -40(1)
; BE32-NEXT: stw 7, -44(1)
-; BE32-NEXT: rlwinm 9, 4, 29, 28, 31
; BE32-NEXT: stw 7, -48(1)
+; BE32-NEXT: rlwinm 9, 4, 29, 28, 29
+; BE32-NEXT: clrlwi 4, 4, 27
; BE32-NEXT: sub 5, 8, 9
-; BE32-NEXT: nand 6, 4, 10
-; BE32-NEXT: clrlwi 4, 4, 29
-; BE32-NEXT: clrlwi 6, 6, 27
-; BE32-NEXT: lwz 7, 4(5)
-; BE32-NEXT: lwz 8, 8(5)
-; BE32-NEXT: lwz 9, 0(5)
-; BE32-NEXT: lwz 5, 12(5)
-; BE32-NEXT: slwi 10, 7, 1
-; BE32-NEXT: srw 11, 8, 4
-; BE32-NEXT: srw 7, 7, 4
-; BE32-NEXT: srw 5, 5, 4
-; BE32-NEXT: slw 6, 10, 6
+; BE32-NEXT: lwz 6, 4(5)
+; BE32-NEXT: lwz 7, 0(5)
+; BE32-NEXT: lwz 8, 12(5)
+; BE32-NEXT: lwz 5, 8(5)
; BE32-NEXT: subfic 10, 4, 32
-; BE32-NEXT: srw 4, 9, 4
-; BE32-NEXT: slw 8, 8, 10
-; BE32-NEXT: slw 10, 9, 10
-; BE32-NEXT: or 6, 11, 6
-; BE32-NEXT: or 7, 10, 7
-; BE32-NEXT: or 5, 8, 5
+; BE32-NEXT: srw 9, 6, 4
+; BE32-NEXT: slw 11, 7, 10
+; BE32-NEXT: srw 8, 8, 4
+; BE32-NEXT: slw 6, 6, 10
+; BE32-NEXT: slw 10, 5, 10
+; BE32-NEXT: srw 5, 5, 4
+; BE32-NEXT: srw 4, 7, 4
+; BE32-NEXT: or 7, 11, 9
+; BE32-NEXT: or 8, 10, 8
+; BE32-NEXT: or 5, 6, 5
; BE32-NEXT: stw 4, 0(3)
-; BE32-NEXT: stw 6, 8(3)
-; BE32-NEXT: stw 5, 12(3)
+; BE32-NEXT: stw 5, 8(3)
+; BE32-NEXT: stw 8, 12(3)
; BE32-NEXT: stw 7, 4(3)
; BE32-NEXT: blr
entry:
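
pr59074.ll shows the right-shift direction of the same recombination. The new checks use two equivalent forms side by side: `subfic ..., 4, 32` feeding slw (safe on PPC, where slw/srw produce zero for shift amounts of 32-63), and the `slwi ...,1` plus `xori ...,31` form. A sketch of the latter, since it is also the form that stays well defined in C++; shiftRightOutWord is a hypothetical name, not code from the patch:

#include <cstdint>

// Illustrative only: one output word of a wide logical right shift,
// mirroring `srw 5, 5, 4` / `slwi 10, 6, 1` / `xori 9, 4, 31` / `slw` / `or`
// in the LE32 checks above. BitShift is the in-word part, 0..31.
uint32_t shiftRightOutWord(uint32_t Hi, uint32_t Lo, unsigned BitShift) {
  // Want (Lo >> B) | (Hi << (32 - B)); (Hi << 1) << (B ^ 31) avoids the
  // undefined shift by 32 at B == 0 and contributes nothing there.
  return (Lo >> BitShift) | ((Hi << 1) << (BitShift ^ 31));
}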
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 044ddf562294c..98c76a7d3887c 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -209,45 +209,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
; LE-32BIT-NEXT: stw 6, 16(1)
-; LE-32BIT-NEXT: addi 6, 1, 32
-; LE-32BIT-NEXT: stw 7, 32(1)
-; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: sub 6, 6, 7
+; LE-32BIT-NEXT: addi 3, 1, 32
; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: li 3, 7
+; LE-32BIT-NEXT: sub 3, 3, 6
; LE-32BIT-NEXT: stw 8, 36(1)
-; LE-32BIT-NEXT: nand 3, 4, 3
-; LE-32BIT-NEXT: lwz 7, 4(6)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: lwz 8, 8(6)
-; LE-32BIT-NEXT: subfic 10, 4, 32
-; LE-32BIT-NEXT: lwz 9, 0(6)
-; LE-32BIT-NEXT: clrlwi 3, 3, 27
-; LE-32BIT-NEXT: lwz 6, 12(6)
-; LE-32BIT-NEXT: srw 11, 8, 4
-; LE-32BIT-NEXT: slw 8, 8, 10
-; LE-32BIT-NEXT: slw 10, 9, 10
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: srw 9, 9, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
-; LE-32BIT-NEXT: slw 3, 7, 3
-; LE-32BIT-NEXT: or 6, 8, 6
-; LE-32BIT-NEXT: or 4, 10, 4
-; LE-32BIT-NEXT: or 3, 11, 3
-; LE-32BIT-NEXT: stw 9, 0(5)
-; LE-32BIT-NEXT: stw 6, 12(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -304,34 +300,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 3, 28(1)
; LE-32BIT-NEXT: addi 3, 1, 16
; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: subfic 8, 4, 32
; LE-32BIT-NEXT: stw 7, 16(1)
-; LE-32BIT-NEXT: li 7, 7
; LE-32BIT-NEXT: lwzux 3, 6, 3
-; LE-32BIT-NEXT: nand 7, 4, 7
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: subfic 10, 4, 32
-; LE-32BIT-NEXT: lwz 8, 8(6)
-; LE-32BIT-NEXT: clrlwi 7, 7, 27
; LE-32BIT-NEXT: lwz 9, 4(6)
; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 7, 8(6)
; LE-32BIT-NEXT: lwz 6, 12(6)
; LE-32BIT-NEXT: slw 11, 9, 4
-; LE-32BIT-NEXT: srw 9, 9, 10
-; LE-32BIT-NEXT: srw 10, 6, 10
-; LE-32BIT-NEXT: slw 6, 6, 4
-; LE-32BIT-NEXT: slw 4, 8, 4
-; LE-32BIT-NEXT: srwi 8, 8, 1
-; LE-32BIT-NEXT: srw 7, 8, 7
+; LE-32BIT-NEXT: srw 9, 9, 8
+; LE-32BIT-NEXT: srw 10, 7, 8
+; LE-32BIT-NEXT: srw 8, 6, 8
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
; LE-32BIT-NEXT: or 3, 3, 9
-; LE-32BIT-NEXT: or 4, 4, 10
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: or 4, 7, 8
; LE-32BIT-NEXT: stw 3, 0(5)
-; LE-32BIT-NEXT: or 3, 11, 7
-; LE-32BIT-NEXT: stw 6, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
; LE-32BIT-NEXT: stw 4, 8(5)
; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
@@ -387,46 +379,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
-; LE-32BIT-NEXT: li 6, 7
+; LE-32BIT-NEXT: addi 6, 1, 32
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: stw 8, 36(1)
-; LE-32BIT-NEXT: rlwinm 8, 4, 29, 28, 31
; LE-32BIT-NEXT: stw 7, 32(1)
-; LE-32BIT-NEXT: addi 7, 1, 32
+; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: nand 6, 4, 6
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
; LE-32BIT-NEXT: stw 3, 28(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
; LE-32BIT-NEXT: stw 3, 24(1)
-; LE-32BIT-NEXT: subfic 10, 4, 32
; LE-32BIT-NEXT: stw 3, 20(1)
-; LE-32BIT-NEXT: clrlwi 6, 6, 27
; LE-32BIT-NEXT: stw 3, 16(1)
-; LE-32BIT-NEXT: sub 3, 7, 8
-; LE-32BIT-NEXT: lwz 7, 4(3)
-; LE-32BIT-NEXT: lwz 8, 8(3)
-; LE-32BIT-NEXT: lwz 9, 0(3)
-; LE-32BIT-NEXT: lwz 3, 12(3)
-; LE-32BIT-NEXT: srw 11, 8, 4
-; LE-32BIT-NEXT: slw 8, 8, 10
-; LE-32BIT-NEXT: slw 10, 9, 10
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
; LE-32BIT-NEXT: srw 3, 3, 4
-; LE-32BIT-NEXT: sraw 9, 9, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
-; LE-32BIT-NEXT: or 3, 8, 3
-; LE-32BIT-NEXT: slw 6, 7, 6
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: sraw 4, 7, 4
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 10, 4
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 11, 6
-; LE-32BIT-NEXT: stw 9, 0(5)
-; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -449,32 +437,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: li 4, 48
; LE-64BIT-NEXT: stxvd2x 2, 7, 4
; LE-64BIT-NEXT: stxvd2x 2, 7, 8
-; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 31
+; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 3, 3, 26
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: stxvd2x 1, 0, 7
-; LE-64BIT-NEXT: li 6, 7
-; LE-64BIT-NEXT: ldux 7, 4, 7
-; LE-64BIT-NEXT: ld 8, 16(4)
-; LE-64BIT-NEXT: nand 6, 3, 6
+; LE-64BIT-NEXT: xori 8, 3, 63
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: ld 7, 16(4)
; LE-64BIT-NEXT: ld 9, 8(4)
-; LE-64BIT-NEXT: clrlwi 3, 3, 29
; LE-64BIT-NEXT: ld 4, 24(4)
-; LE-64BIT-NEXT: clrlwi 6, 6, 26
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sldi 11, 7, 1
+; LE-64BIT-NEXT: srd 10, 9, 3
; LE-64BIT-NEXT: srd 7, 7, 3
-; LE-64BIT-NEXT: sldi 10, 8, 1
-; LE-64BIT-NEXT: srd 11, 9, 3
-; LE-64BIT-NEXT: srd 8, 8, 3
-; LE-64BIT-NEXT: sld 6, 10, 6
+; LE-64BIT-NEXT: sld 8, 11, 8
+; LE-64BIT-NEXT: or 8, 10, 8
; LE-64BIT-NEXT: subfic 10, 3, 64
; LE-64BIT-NEXT: srd 3, 4, 3
-; LE-64BIT-NEXT: or 6, 11, 6
; LE-64BIT-NEXT: sld 11, 4, 10
; LE-64BIT-NEXT: sld 9, 9, 10
; LE-64BIT-NEXT: std 3, 24(5)
-; LE-64BIT-NEXT: or 7, 9, 7
-; LE-64BIT-NEXT: or 3, 11, 8
-; LE-64BIT-NEXT: std 6, 8(5)
-; LE-64BIT-NEXT: std 7, 0(5)
+; LE-64BIT-NEXT: std 8, 8(5)
+; LE-64BIT-NEXT: or 6, 9, 6
+; LE-64BIT-NEXT: or 3, 11, 7
+; LE-64BIT-NEXT: std 6, 0(5)
; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
@@ -485,44 +471,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: addi 11, 1, -32
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: rlwinm 3, 4, 29, 27, 31
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 29, 27, 28
; BE-NEXT: neg 3, 3
-; BE-NEXT: std 10, 24(9)
-; BE-NEXT: std 10, 16(9)
-; BE-NEXT: std 10, 8(9)
-; BE-NEXT: std 10, -64(1)
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
-; BE-NEXT: std 6, 32(9)
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: ldux 3, 11, 3
-; BE-NEXT: li 6, 7
-; BE-NEXT: nand 6, 4, 6
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 6, 6, 26
-; BE-NEXT: ld 7, 8(11)
-; BE-NEXT: ld 8, 16(11)
-; BE-NEXT: ld 9, 24(11)
-; BE-NEXT: subfic 10, 4, 64
-; BE-NEXT: sldi 11, 7, 1
-; BE-NEXT: srd 7, 7, 4
-; BE-NEXT: srd 9, 9, 4
-; BE-NEXT: sld 6, 11, 6
-; BE-NEXT: sld 11, 3, 10
-; BE-NEXT: sld 10, 8, 10
-; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
; BE-NEXT: srd 3, 3, 4
-; BE-NEXT: or 7, 11, 7
-; BE-NEXT: or 6, 8, 6
-; BE-NEXT: or 8, 10, 9
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 8, 24(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: lshr_32bytes:
@@ -546,68 +527,64 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
; LE-32BIT-NEXT: stw 3, 80(1)
; LE-32BIT-NEXT: addi 3, 1, 52
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: sub 3, 3, 6
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: subfic 30, 4, 32
; LE-32BIT-NEXT: stw 0, 76(1)
; LE-32BIT-NEXT: stw 12, 72(1)
+; LE-32BIT-NEXT: xori 12, 4, 31
; LE-32BIT-NEXT: stw 11, 68(1)
; LE-32BIT-NEXT: stw 10, 64(1)
; LE-32BIT-NEXT: stw 9, 60(1)
-; LE-32BIT-NEXT: li 9, 7
; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: nand 9, 4, 9
; LE-32BIT-NEXT: stw 7, 52(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: lwz 6, 4(3)
-; LE-32BIT-NEXT: subfic 30, 4, 32
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: clrlwi 9, 9, 27
-; LE-32BIT-NEXT: lwz 8, 12(3)
-; LE-32BIT-NEXT: slwi 29, 6, 1
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: srw 28, 7, 4
-; LE-32BIT-NEXT: lwz 11, 20(3)
-; LE-32BIT-NEXT: slwi 27, 8, 1
-; LE-32BIT-NEXT: lwz 12, 24(3)
+; LE-32BIT-NEXT: lwz 6, 8(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: srw 29, 6, 4
+; LE-32BIT-NEXT: lwz 9, 12(3)
+; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: srw 27, 9, 4
+; LE-32BIT-NEXT: lwz 0, 28(3)
; LE-32BIT-NEXT: srw 26, 10, 4
-; LE-32BIT-NEXT: lwz 0, 0(3)
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: srw 25, 12, 4
-; LE-32BIT-NEXT: slw 12, 12, 30
-; LE-32BIT-NEXT: slw 7, 7, 30
-; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: slw 25, 11, 30
+; LE-32BIT-NEXT: slw 9, 9, 30
; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 0, 30
-; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 3, 3, 4
; LE-32BIT-NEXT: srw 0, 0, 4
-; LE-32BIT-NEXT: srw 4, 11, 4
-; LE-32BIT-NEXT: or 3, 12, 3
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 30, 0
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 10, 4
-; LE-32BIT-NEXT: slwi 11, 11, 1
+; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 7, 8
-; LE-32BIT-NEXT: slw 29, 29, 9
-; LE-32BIT-NEXT: slw 27, 27, 9
-; LE-32BIT-NEXT: slw 9, 11, 9
+; LE-32BIT-NEXT: or 3, 6, 27
+; LE-32BIT-NEXT: slw 7, 7, 12
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 30, 6
+; LE-32BIT-NEXT: or 3, 28, 4
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 25, 9
-; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 26, 27
-; LE-32BIT-NEXT: stw 3, 16(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 0, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 7
+; LE-32BIT-NEXT: stw 8, 0(5)
; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
@@ -635,37 +612,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
; LE-64BIT-NEXT: stxvd2x 2, 7, 6
; LE-64BIT-NEXT: li 6, 48
-; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 31
+; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 4, 4, 26
; LE-64BIT-NEXT: neg 3, 3
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: li 6, 32
; LE-64BIT-NEXT: extsw 3, 3
; LE-64BIT-NEXT: stxvd2x 1, 7, 6
; LE-64BIT-NEXT: stxvd2x 2, 0, 7
-; LE-64BIT-NEXT: li 6, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
; LE-64BIT-NEXT: ldux 3, 8, 3
-; LE-64BIT-NEXT: ld 7, 8(8)
-; LE-64BIT-NEXT: nand 6, 4, 6
-; LE-64BIT-NEXT: ld 9, 16(8)
-; LE-64BIT-NEXT: clrlwi 4, 4, 29
-; LE-64BIT-NEXT: ld 8, 24(8)
-; LE-64BIT-NEXT: clrlwi 6, 6, 26
-; LE-64BIT-NEXT: rldicl 10, 7, 63, 1
-; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
; LE-64BIT-NEXT: sld 7, 7, 4
-; LE-64BIT-NEXT: srd 6, 10, 6
-; LE-64BIT-NEXT: sld 10, 9, 4
-; LE-64BIT-NEXT: or 6, 10, 6
-; LE-64BIT-NEXT: subfic 10, 4, 64
-; LE-64BIT-NEXT: srd 9, 9, 10
-; LE-64BIT-NEXT: srd 10, 3, 10
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
; LE-64BIT-NEXT: sld 3, 3, 4
-; LE-64BIT-NEXT: std 6, 16(5)
-; LE-64BIT-NEXT: or 7, 7, 10
+; LE-64BIT-NEXT: or 6, 8, 6
; LE-64BIT-NEXT: std 3, 0(5)
-; LE-64BIT-NEXT: or 3, 8, 9
-; LE-64BIT-NEXT: std 7, 8(5)
-; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: shl_32bytes:
@@ -675,41 +648,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 56(9)
-; BE-NEXT: std 10, 48(9)
-; BE-NEXT: std 10, 40(9)
-; BE-NEXT: std 10, 32(9)
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 8, 16(9)
-; BE-NEXT: std 7, 8(9)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
; BE-NEXT: std 6, -64(1)
-; BE-NEXT: rlwinm 3, 4, 29, 27, 31
-; BE-NEXT: ldux 6, 3, 9
-; BE-NEXT: li 7, 7
-; BE-NEXT: nand 7, 4, 7
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 7, 7, 26
-; BE-NEXT: ld 8, 16(3)
-; BE-NEXT: ld 9, 8(3)
+; BE-NEXT: rlwinm 3, 4, 29, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
; BE-NEXT: ld 3, 24(3)
-; BE-NEXT: subfic 10, 4, 64
; BE-NEXT: sld 6, 6, 4
-; BE-NEXT: rldicl 11, 8, 63, 1
-; BE-NEXT: sld 8, 8, 4
-; BE-NEXT: srd 7, 11, 7
-; BE-NEXT: srd 11, 9, 10
-; BE-NEXT: sld 9, 9, 4
-; BE-NEXT: srd 10, 3, 10
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
; BE-NEXT: sld 3, 3, 4
-; BE-NEXT: or 6, 6, 11
-; BE-NEXT: or 7, 9, 7
-; BE-NEXT: or 8, 8, 10
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
; BE-NEXT: std 3, 24(5)
-; BE-NEXT: std 8, 16(5)
+; BE-NEXT: std 7, 16(5)
; BE-NEXT: std 6, 0(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: shl_32bytes:
@@ -739,61 +708,55 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 60(1)
; LE-32BIT-NEXT: stw 6, 56(1)
; LE-32BIT-NEXT: stw 6, 52(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
; LE-32BIT-NEXT: stw 3, 48(1)
; LE-32BIT-NEXT: addi 3, 1, 20
; LE-32BIT-NEXT: stw 0, 44(1)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 12, 40(1)
+; LE-32BIT-NEXT: subfic 12, 4, 32
; LE-32BIT-NEXT: stw 11, 36(1)
; LE-32BIT-NEXT: stw 10, 32(1)
; LE-32BIT-NEXT: stw 9, 28(1)
; LE-32BIT-NEXT: stw 8, 24(1)
-; LE-32BIT-NEXT: li 8, 7
; LE-32BIT-NEXT: stw 7, 20(1)
-; LE-32BIT-NEXT: nand 8, 4, 8
; LE-32BIT-NEXT: lwzux 3, 6, 3
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: subfic 0, 4, 32
-; LE-32BIT-NEXT: clrlwi 8, 8, 27
; LE-32BIT-NEXT: lwz 7, 8(6)
; LE-32BIT-NEXT: slw 3, 3, 4
-; LE-32BIT-NEXT: lwz 9, 4(6)
-; LE-32BIT-NEXT: lwz 10, 16(6)
-; LE-32BIT-NEXT: srwi 29, 7, 1
-; LE-32BIT-NEXT: lwz 11, 12(6)
-; LE-32BIT-NEXT: slw 28, 9, 4
-; LE-32BIT-NEXT: lwz 12, 24(6)
-; LE-32BIT-NEXT: srwi 27, 10, 1
-; LE-32BIT-NEXT: lwz 30, 20(6)
-; LE-32BIT-NEXT: slw 26, 11, 4
+; LE-32BIT-NEXT: lwz 8, 4(6)
+; LE-32BIT-NEXT: lwz 9, 16(6)
+; LE-32BIT-NEXT: srw 30, 7, 12
+; LE-32BIT-NEXT: lwz 10, 12(6)
+; LE-32BIT-NEXT: slw 29, 8, 4
+; LE-32BIT-NEXT: lwz 11, 24(6)
+; LE-32BIT-NEXT: srw 8, 8, 12
+; LE-32BIT-NEXT: lwz 0, 20(6)
+; LE-32BIT-NEXT: srw 28, 9, 12
; LE-32BIT-NEXT: lwz 6, 28(6)
-; LE-32BIT-NEXT: srw 9, 9, 0
-; LE-32BIT-NEXT: slw 25, 30, 4
-; LE-32BIT-NEXT: srw 11, 11, 0
+; LE-32BIT-NEXT: slw 27, 10, 4
+; LE-32BIT-NEXT: srw 10, 10, 12
; LE-32BIT-NEXT: slw 7, 7, 4
-; LE-32BIT-NEXT: srw 30, 30, 0
-; LE-32BIT-NEXT: slw 10, 10, 4
-; LE-32BIT-NEXT: srw 0, 6, 0
-; LE-32BIT-NEXT: slw 6, 6, 4
-; LE-32BIT-NEXT: slw 4, 12, 4
-; LE-32BIT-NEXT: srwi 12, 12, 1
-; LE-32BIT-NEXT: srw 29, 29, 8
-; LE-32BIT-NEXT: srw 27, 27, 8
-; LE-32BIT-NEXT: srw 8, 12, 8
-; LE-32BIT-NEXT: or 3, 3, 9
-; LE-32BIT-NEXT: or 4, 4, 0
-; LE-32BIT-NEXT: stw 3, 0(5)
-; LE-32BIT-NEXT: or 3, 25, 8
+; LE-32BIT-NEXT: srw 26, 11, 12
+; LE-32BIT-NEXT: slw 25, 0, 4
+; LE-32BIT-NEXT: srw 0, 0, 12
+; LE-32BIT-NEXT: slw 9, 9, 4
+; LE-32BIT-NEXT: srw 12, 6, 12
+; LE-32BIT-NEXT: slw 11, 11, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: stw 4, 28(5)
+; LE-32BIT-NEXT: or 4, 11, 12
; LE-32BIT-NEXT: stw 4, 24(5)
-; LE-32BIT-NEXT: or 4, 10, 30
-; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 26, 27
+; LE-32BIT-NEXT: or 4, 9, 0
; LE-32BIT-NEXT: stw 4, 16(5)
-; LE-32BIT-NEXT: or 4, 7, 11
-; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 6, 28(5)
+; LE-32BIT-NEXT: or 4, 25, 26
+; LE-32BIT-NEXT: stw 4, 20(5)
+; LE-32BIT-NEXT: or 4, 7, 10
+; LE-32BIT-NEXT: or 3, 3, 8
; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: or 4, 27, 28
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 12(5)
; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
@@ -812,91 +775,84 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_32bytes:
; LE-64BIT: # %bb.0:
-; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: ld 6, 24(3)
+; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: lwz 4, 0(4)
; LE-64BIT-NEXT: addi 7, 1, -64
; LE-64BIT-NEXT: ld 3, 16(3)
; LE-64BIT-NEXT: sradi 8, 6, 63
-; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 31
-; LE-64BIT-NEXT: std 6, 24(7)
-; LE-64BIT-NEXT: std 3, 16(7)
-; LE-64BIT-NEXT: li 3, 7
-; LE-64BIT-NEXT: std 8, 56(7)
-; LE-64BIT-NEXT: std 8, 48(7)
-; LE-64BIT-NEXT: std 8, 40(7)
-; LE-64BIT-NEXT: std 8, 32(7)
+; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 4, 4, 26
; LE-64BIT-NEXT: stxvd2x 0, 0, 7
-; LE-64BIT-NEXT: nand 3, 4, 3
-; LE-64BIT-NEXT: clrlwi 4, 4, 29
-; LE-64BIT-NEXT: ldux 6, 9, 7
-; LE-64BIT-NEXT: ld 7, 16(9)
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: ldux 3, 9, 7
+; LE-64BIT-NEXT: xori 7, 4, 63
+; LE-64BIT-NEXT: ld 6, 16(9)
; LE-64BIT-NEXT: ld 8, 8(9)
-; LE-64BIT-NEXT: clrlwi 3, 3, 26
; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 3, 3, 4
+; LE-64BIT-NEXT: sldi 11, 6, 1
+; LE-64BIT-NEXT: srd 10, 8, 4
; LE-64BIT-NEXT: srd 6, 6, 4
-; LE-64BIT-NEXT: sldi 10, 7, 1
-; LE-64BIT-NEXT: srd 11, 8, 4
-; LE-64BIT-NEXT: srd 7, 7, 4
-; LE-64BIT-NEXT: sld 3, 10, 3
+; LE-64BIT-NEXT: sld 7, 11, 7
+; LE-64BIT-NEXT: or 7, 10, 7
; LE-64BIT-NEXT: subfic 10, 4, 64
; LE-64BIT-NEXT: srad 4, 9, 4
-; LE-64BIT-NEXT: or 3, 11, 3
-; LE-64BIT-NEXT: sld 11, 9, 10
; LE-64BIT-NEXT: sld 8, 8, 10
+; LE-64BIT-NEXT: sld 11, 9, 10
; LE-64BIT-NEXT: std 4, 24(5)
-; LE-64BIT-NEXT: or 6, 8, 6
-; LE-64BIT-NEXT: or 4, 11, 7
-; LE-64BIT-NEXT: std 3, 8(5)
-; LE-64BIT-NEXT: std 6, 0(5)
-; LE-64BIT-NEXT: std 4, 16(5)
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: or 3, 8, 3
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 11, 6
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: ashr_32bytes:
; BE: # %bb.0:
-; BE-NEXT: ld 6, 0(3)
-; BE-NEXT: ld 7, 8(3)
-; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: addi 10, 1, -32
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: std 6, 32(9)
-; BE-NEXT: sradi 3, 6, 63
-; BE-NEXT: rlwinm 6, 4, 29, 27, 31
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 3, 16(9)
-; BE-NEXT: std 3, 8(9)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: sradi 3, 7, 63
+; BE-NEXT: rlwinm 7, 4, 29, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
; BE-NEXT: std 3, -64(1)
-; BE-NEXT: neg 3, 6
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: ldux 3, 10, 3
-; BE-NEXT: li 6, 7
-; BE-NEXT: nand 6, 4, 6
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 6, 6, 26
-; BE-NEXT: ld 7, 8(10)
-; BE-NEXT: ld 8, 16(10)
-; BE-NEXT: ld 9, 24(10)
-; BE-NEXT: subfic 10, 4, 64
-; BE-NEXT: sldi 11, 7, 1
-; BE-NEXT: srd 7, 7, 4
-; BE-NEXT: srd 9, 9, 4
-; BE-NEXT: sld 6, 11, 6
-; BE-NEXT: sld 11, 3, 10
-; BE-NEXT: sld 10, 8, 10
-; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
; BE-NEXT: srad 3, 3, 4
-; BE-NEXT: or 7, 11, 7
-; BE-NEXT: or 6, 8, 6
-; BE-NEXT: or 8, 10, 9
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 8, 24(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: ashr_32bytes:
@@ -915,73 +871,69 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 3, 80(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
; LE-32BIT-NEXT: stw 7, 52(1)
-; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 31
+; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 29
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: subfic 30, 4, 32
; LE-32BIT-NEXT: stw 0, 76(1)
; LE-32BIT-NEXT: stw 12, 72(1)
+; LE-32BIT-NEXT: xori 12, 4, 31
; LE-32BIT-NEXT: stw 11, 68(1)
; LE-32BIT-NEXT: stw 10, 64(1)
; LE-32BIT-NEXT: stw 9, 60(1)
-; LE-32BIT-NEXT: li 9, 7
; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: nand 9, 4, 9
; LE-32BIT-NEXT: stw 3, 48(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: subfic 30, 4, 32
; LE-32BIT-NEXT: stw 3, 40(1)
-; LE-32BIT-NEXT: clrlwi 9, 9, 27
; LE-32BIT-NEXT: stw 3, 36(1)
; LE-32BIT-NEXT: stw 3, 32(1)
; LE-32BIT-NEXT: stw 3, 28(1)
; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 20(1)
; LE-32BIT-NEXT: sub 3, 6, 7
-; LE-32BIT-NEXT: lwz 6, 4(3)
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: lwz 8, 12(3)
-; LE-32BIT-NEXT: slwi 29, 6, 1
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: srw 28, 7, 4
-; LE-32BIT-NEXT: lwz 11, 20(3)
-; LE-32BIT-NEXT: slwi 27, 8, 1
-; LE-32BIT-NEXT: lwz 12, 24(3)
+; LE-32BIT-NEXT: lwz 6, 8(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: srw 29, 6, 4
+; LE-32BIT-NEXT: lwz 9, 12(3)
+; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: srw 27, 9, 4
+; LE-32BIT-NEXT: lwz 0, 28(3)
; LE-32BIT-NEXT: srw 26, 10, 4
-; LE-32BIT-NEXT: lwz 0, 0(3)
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: srw 25, 12, 4
-; LE-32BIT-NEXT: slw 12, 12, 30
-; LE-32BIT-NEXT: slw 7, 7, 30
-; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: slw 25, 11, 30
+; LE-32BIT-NEXT: slw 9, 9, 30
; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 0, 30
-; LE-32BIT-NEXT: srw 8, 8, 4
-; LE-32BIT-NEXT: sraw 0, 0, 4
-; LE-32BIT-NEXT: srw 4, 11, 4
-; LE-32BIT-NEXT: or 3, 12, 3
+; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 0, 0, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 30, 0
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 10, 4
-; LE-32BIT-NEXT: slwi 11, 11, 1
+; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: sraw 8, 8, 4
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 7, 8
-; LE-32BIT-NEXT: slw 29, 29, 9
-; LE-32BIT-NEXT: slw 27, 27, 9
-; LE-32BIT-NEXT: slw 9, 11, 9
+; LE-32BIT-NEXT: or 3, 6, 27
+; LE-32BIT-NEXT: slw 7, 7, 12
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 30, 6
+; LE-32BIT-NEXT: or 3, 28, 4
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 25, 9
-; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 26, 27
-; LE-32BIT-NEXT: stw 3, 16(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 0, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 7
+; LE-32BIT-NEXT: stw 8, 0(5)
; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
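
On the 64-bit PowerPC paths above, the same split happens at doubleword granularity: rlwinm ...,29,27,28 rounds the byte offset down to a multiple of 8 and clrlwi ...,26 keeps the six-bit in-doubleword shift. A quick hypothetical check of the splitShiftAmount sketch from earlier with that unit:

// Hypothetical usage of the splitShiftAmount() sketch above.
SplitShift S = splitShiftAmount(/*ShAmtBits=*/100, /*UnitBytes=*/8);
// S.ByteOffset == 8  (100 / 8 == 12, rounded down to a multiple of 8)
// S.BitShift   == 36 (100 % 64)

The X86 hunks below follow suit: `andb $12, %al` replaces `andb $15, %al` when computing the slot offset, the `andb $7` sub-byte masking disappears, and the manual `notb`/`shll`/`shrl`/`orl` recombination gives way to plain `shldl`/`shrdl` pairs.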
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1c303de55c95d..54106bde42527 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -177,7 +177,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $156, %esp
+; X86-NEXT: subl $152, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -194,48 +194,47 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: xorl %edx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: xorl %edx, %edi
; X86-NEXT: subl %edx, %edi
; X86-NEXT: sbbl %edx, %ebp
-; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebx, %ecx
+; X86-NEXT: bsrl %esi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %ebp, %edx
; X86-NEXT: xorl $31, %edx
@@ -247,12 +246,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: testl %ebp, %ebp
; X86-NEXT: cmovnel %edx, %edi
; X86-NEXT: addl $64, %edi
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %edx
; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -261,7 +260,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -270,299 +269,293 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %ebx, %ebp
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: xorl %esi, %esi
; X86-NEXT: subl %edx, %edi
+; X86-NEXT: movl $0, %ebp
+; X86-NEXT: sbbl %ebp, %ebp
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %eax
; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: movl $0, %esi
-; X86-NEXT: sbbl %esi, %esi
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %edi, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebp, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
-; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %edx
-; X86-NEXT: cmovnel %ebx, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: cmovnel %esi, %ecx
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.8: # %_udiv-special-cases
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %eax
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: jne .LBB4_8
-; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: xorl $127, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $127, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: je .LBB4_8
-; X86-NEXT: # %bb.2: # %udiv-bb1
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: je .LBB4_9
+; X86-NEXT: # %bb.5: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: xorb $127, %al
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 148(%esp,%edi), %edx
-; X86-NEXT: movl 152(%esp,%edi), %esi
-; X86-NEXT: movb %ch, %cl
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 144(%esp,%eax), %edx
+; X86-NEXT: movl 148(%esp,%eax), %esi
; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 144(%esp,%edi), %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: shrl %ebp
-; X86-NEXT: shrl %cl, %ebp
-; X86-NEXT: orl %edx, %ebp
-; X86-NEXT: movl 140(%esp,%edi), %edx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: jae .LBB4_3
+; X86-NEXT: movl 136(%esp,%eax), %esi
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: jmp .LBB4_9
+; X86-NEXT: .LBB4_2: # %udiv-preheader
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %dl, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %edx
-; X86-NEXT: movl 104(%esp,%edx), %ebx
-; X86-NEXT: movl 100(%esp,%edx), %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %ebx, %ebp
-; X86-NEXT: movl 92(%esp,%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl 100(%esp,%edx), %ebx
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
; X86-NEXT: movl 96(%esp,%edx), %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %cl
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esp,%edx), %eax
+; X86-NEXT: movl 92(%esp,%edx), %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: shrdl %cl, %esi, %ebp
; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB4_4: # %udiv-do-while
+; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ebp, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: shldl $1, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl $1, %eax, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: shldl $1, %ecx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %ecx
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebp
+; X86-NEXT: subl %ecx, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: sbbl %edi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: sbbl %edx, %ebp
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: addl $-1, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: jne .LBB4_4
-; X86-NEXT: # %bb.5:
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: jne .LBB4_3
+; X86-NEXT: # %bb.4:
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebp, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: shldl $1, %eax, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %eax
-; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: .LBB4_8: # %udiv-end
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: xorl %ecx, %ebp
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: .LBB4_9: # %udiv-end
+; X86-NEXT: xorl %ebp, %eax
+; X86-NEXT: xorl %ebp, %edx
+; X86-NEXT: xorl %ebp, %ecx
+; X86-NEXT: xorl %ebp, %esi
+; X86-NEXT: subl %ebp, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %ebp, %edx
+; X86-NEXT: sbbl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %ebp, 8(%ecx)
-; X86-NEXT: movl %edx, 12(%ecx)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %esi, (%edi)
+; X86-NEXT: movl %ecx, 4(%edi)
+; X86-NEXT: movl %edx, 8(%edi)
+; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ebx, %edx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: imull %eax, %ecx
; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull %ebp, %edi
+; X86-NEXT: imull %esi, %edi
; X86-NEXT: addl %edx, %edi
; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %ecx
+; X86-NEXT: imull %ebp, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-NEXT: imull %edx, %esi
; X86-NEXT: mull %edx
; X86-NEXT: addl %edx, %esi
@@ -572,7 +565,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
@@ -584,7 +577,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ebx, 8(%eax)
; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: addl $156, %esp
+; X86-NEXT: addl $152, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
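
The recurring `shrb $3, %al` / `andb $12, %al` pattern in the updated checks above is the i686 lowering of the new offset computation: the bit-shift amount is rounded down to a register-width (4-byte) stack offset, and the residual 0-31-bit shift is done with funnel shifts. The old checks instead masked with `andb $15` (byte granularity) plus a separate `andb $7` sub-byte count, which allowed loads from unaligned stack addresses. A minimal standalone sketch of the new arithmetic, with illustrative names that are not taken from the patch:

    #include <cassert>

    // Sketch (not code from the patch): split a bit-shift amount into a
    // unit-aligned stack-slot byte offset plus a residual in-register
    // shift, mirroring the legalized i128 shift on a 32-bit target where
    // the load/store unit is i32 (UnitBytes == 4).
    struct SplitShift {
      unsigned ByteOffset; // multiple of UnitBytes; picks the words to load
      unsigned BitShift;   // residual shift, 0 .. UnitBytes * 8 - 1
    };

    static SplitShift splitShiftAmount(unsigned ShAmtBits, unsigned UnitBytes) {
      // (ShAmt >> 3) & ~(UnitBytes - 1): bytes, rounded down to unit
      // alignment. For UnitBytes == 4 this is the `shrb $3; andb $12` pair
      // in the i686 checks (offsets into the 2x-wide slot stay below 16,
      // so the constant 12 equals ~3 within that range).
      unsigned ByteOffset = (ShAmtBits / 8) & ~(UnitBytes - 1);
      // The remainder fits a 0..31 count in %cl and is done with shldl/shrdl.
      unsigned BitShift = ShAmtBits % (UnitBytes * 8);
      return {ByteOffset, BitShift};
    }

    int main() {
      SplitShift S = splitShiftAmount(75, 4); // shift an i128 value by 75 bits
      assert(S.ByteOffset == 8 && S.BitShift == 11);
      assert(S.ByteOffset * 8 + S.BitShift == 75);
      return 0;
    }
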
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index fa45afbb634c4..84f35c6485abe 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -177,14 +177,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $136, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: subl $132, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sete %bl
@@ -195,95 +195,97 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %bl, %al
; X86-NEXT: movb %al, (%esp) # 1-byte Spill
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: bsrl %edi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %edi, %edi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl %ebp, %ebp
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: xorl $31, %ebp
; X86-NEXT: addl $32, %ebp
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ebp
; X86-NEXT: addl $64, %ebp
; X86-NEXT: movl %edi, %edx
-; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: bsrl %esi, %edx
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: bsrl %edi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: bsrl %eax, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: subl %edx, %ebp
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $127, %ecx
; X86-NEXT: cmpl %ebp, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: xorl $127, %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: cmovnel %edi, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovnel %edi, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovnel %edx, %esi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: cmovnel %edx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmovnel %edi, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmovnel %edi, %ebx
+; X86-NEXT: cmovnel %edx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: cmovnel %edi, %edx
; X86-NEXT: orb %cl, %al
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: jne .LBB4_7
; X86-NEXT: # %bb.1: # %udiv-bb1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -291,89 +293,78 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: xorb $127, %al
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 128(%esp,%eax), %edx
-; X86-NEXT: movl 132(%esp,%eax), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: notb %cl
; X86-NEXT: movl 124(%esp,%eax), %ebp
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl 128(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %ebp, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 116(%esp,%eax), %edx
; X86-NEXT: movl 120(%esp,%eax), %eax
-; X86-NEXT: movb %ch, %cl
; X86-NEXT: shldl %cl, %eax, %ebp
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: shldl %cl, %edx, %ebp
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.5:
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: jmp .LBB4_6
; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 84(%esp,%eax), %ebx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 80(%esp,%eax), %esi
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 76(%esp,%eax), %ebp
+; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: shrdl %cl, %esi, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esp,%eax), %ebp
-; X86-NEXT: movl 76(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shrdl %cl, %edx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%esp,%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esp,%eax), %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -383,148 +374,145 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $-1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: shldl $1, %ebx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl $1, %edi, %eax
-; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %edx, %edi
-; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: orl %ebp, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sbbl %ebp, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
; X86-NEXT: sbbl %edi, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: movl (%esp), %edi # 4-byte Reload
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: .LBB4_6: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %edi, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: shldl $1, %ebp, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: .LBB4_6: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: shldl $1, %ebp, %ebx
; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %ebp
+; X86-NEXT: orl %eax, %ebp
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: .LBB4_7: # %udiv-end
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl %ecx, (%esi)
+; X86-NEXT: movl %ebp, 4(%esi)
+; X86-NEXT: movl %ebx, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %esi
; X86-NEXT: imull %ebp, %esi
; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: imull %ebp, %ecx
+; X86-NEXT: addl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: imull %ecx, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: imull %ebp, %edi
; X86-NEXT: addl %edx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -532,11 +520,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X86-NEXT: adcl %ebx, %edx
; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: sbbl %eax, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -546,7 +534,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %edi, 4(%eax)
; X86-NEXT: movl %ebx, 8(%eax)
; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: addl $136, %esp
+; X86-NEXT: addl $132, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index fbc363f77ec42..3dbd0213293bb 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,28 +23,28 @@ define void @f() nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $160, %esp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl (%eax), %eax
; X86-NEXT: movzbl (%eax), %ecx
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: divb %cl
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: shll $30, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: sarl $30, %ecx
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %edx
; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: xorl %eax, %edx
; X86-NEXT: shrdl $1, %eax, %ecx
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: andl $3, %edx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: testl %edx, %edx
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %BB_udiv-special-cases
; X86-NEXT: bsrl %esi, %eax
@@ -52,18 +52,19 @@ define void @f() nounwind {
; X86-NEXT: addl $32, %eax
; X86-NEXT: jmp .LBB0_3
; X86-NEXT: .LBB0_1:
-; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: bsrl %edx, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testl %edx, %edx
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB0_4
; X86-NEXT: # %bb.5: # %BB_udiv-special-cases
; X86-NEXT: addl $64, %eax
; X86-NEXT: jmp .LBB0_6
; X86-NEXT: .LBB0_4:
-; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: bsrl %edi, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: addl $32, %eax
; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases
@@ -82,7 +83,6 @@ define void @f() nounwind {
; X86-NEXT: andl $3, %esi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl $65, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: je .LBB0_11
@@ -92,17 +92,16 @@ define void @f() nounwind {
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: andl $3, %ebx
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
-; X86-NEXT: movb %cl, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: negb %cl
-; X86-NEXT: movsbl %cl, %eax
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %esi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -111,31 +110,22 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%esp,%eax), %edi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 112(%esp,%eax), %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 112(%esp,%esi), %eax
+; X86-NEXT: movl 116(%esp,%esi), %edx
+; X86-NEXT: movl 120(%esp,%esi), %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.9: # %udiv-preheader
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: andl $3, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $3, %esi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -144,25 +134,18 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 72(%esp,%eax), %edx
; X86-NEXT: movl 64(%esp,%eax), %edi
-; X86-NEXT: movl 68(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 72(%esp,%eax), %ebx
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl 68(%esp,%eax), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrdl %cl, %edx, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -180,60 +163,59 @@ define void @f() nounwind {
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_10: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl $1, %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl %esi, %edx
; X86-NEXT: andl $2, %edx
; X86-NEXT: shrl %edx
; X86-NEXT: leal (%edx,%ebx,2), %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: sbbl %ecx, %esi
-; X86-NEXT: shll $30, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: sarl $30, %edx
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: shrdl $1, %esi, %edx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: shll $30, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sarl $30, %edi
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: shrdl $1, %edx, %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: subl %edi, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $3, %esi
-; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $3, %edi
+; X86-NEXT: andl $3, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edx, %eax
; X86-NEXT: jne .LBB0_10
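
The deleted sequences in the hunks above (`shrl %cl`, `notb %cl`, `addl`/`shll`, `orl`) merged adjacent words by hand, shifting the neighbor first by 1 and then by the inverted count to sidestep an out-of-range shift-by-32; with unit-aligned offsets the residual count always fits `%cl`, so a single `shrdl`/`shldl` funnel shift does the merge, which is why the `notb %cl` dance disappears from the checks. A small sketch of what one `shrdl %cl, hi, lo` check line computes (hypothetical helper, not code from the patch):

    #include <cassert>
    #include <cstdint>

    // Sketch: shift the 64-bit concatenation hi:lo right by c and keep the
    // low 32 bits, so two adjacent stack words are combined in one step.
    static uint32_t funnelShiftRight(uint32_t Hi, uint32_t Lo, unsigned C) {
      uint64_t Concat = (uint64_t(Hi) << 32) | Lo;
      return uint32_t(Concat >> (C & 31)); // & 31 mirrors the hardware mask
    }

    int main() {
      assert(funnelShiftRight(0x1, 0x80000000u, 1) == 0xC0000000u);
      assert(funnelShiftRight(0xDEADBEEFu, 0x0, 0) == 0x0); // c == 0: no-op
      return 0;
    }
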
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 785b97d8c2402..b2ff06798aad7 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -14,6 +14,7 @@ define i256 @test1(i256 %a) nounwind {
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: leal (%rsi,%rsi), %ecx
+; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -22,10 +23,9 @@ define i256 @test1(i256 %a) nounwind {
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
-; ILP-NEXT: andb $7, %cl
+; ILP-NEXT: andb $24, %dl
; ILP-NEXT: negb %dl
; ILP-NEXT: movsbq %dl, %rdx
; ILP-NEXT: movq -16(%rsp,%rdx), %rsi
@@ -60,13 +60,13 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: addl %esi, %esi
-; HYBRID-NEXT: addb $3, %sil
-; HYBRID-NEXT: movl %esi, %ecx
-; HYBRID-NEXT: andb $7, %cl
-; HYBRID-NEXT: shrb $3, %sil
-; HYBRID-NEXT: negb %sil
-; HYBRID-NEXT: movsbq %sil, %rdx
+; HYBRID-NEXT: leal (%rsi,%rsi), %ecx
+; HYBRID-NEXT: addb $3, %cl
+; HYBRID-NEXT: movl %ecx, %edx
+; HYBRID-NEXT: shrb $3, %dl
+; HYBRID-NEXT: andb $24, %dl
+; HYBRID-NEXT: negb %dl
+; HYBRID-NEXT: movsbq %dl, %rdx
; HYBRID-NEXT: movq -16(%rsp,%rdx), %rsi
; HYBRID-NEXT: movq -8(%rsp,%rdx), %rdi
; HYBRID-NEXT: shldq %cl, %rsi, %rdi
@@ -81,6 +81,7 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: shlq %cl, %rsi
; HYBRID-NEXT: notb %cl
; HYBRID-NEXT: shrq %rdx
+; HYBRID-NEXT: # kill: def $cl killed $cl killed $ecx
; HYBRID-NEXT: shrq %cl, %rdx
; HYBRID-NEXT: orq %rsi, %rdx
; HYBRID-NEXT: movq %rdx, 16(%rax)
@@ -97,13 +98,13 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: addl %esi, %esi
-; BURR-NEXT: addb $3, %sil
-; BURR-NEXT: movl %esi, %ecx
-; BURR-NEXT: andb $7, %cl
-; BURR-NEXT: shrb $3, %sil
-; BURR-NEXT: negb %sil
-; BURR-NEXT: movsbq %sil, %rdx
+; BURR-NEXT: leal (%rsi,%rsi), %ecx
+; BURR-NEXT: addb $3, %cl
+; BURR-NEXT: movl %ecx, %edx
+; BURR-NEXT: shrb $3, %dl
+; BURR-NEXT: andb $24, %dl
+; BURR-NEXT: negb %dl
+; BURR-NEXT: movsbq %dl, %rdx
; BURR-NEXT: movq -16(%rsp,%rdx), %rsi
; BURR-NEXT: movq -8(%rsp,%rdx), %rdi
; BURR-NEXT: shldq %cl, %rsi, %rdi
@@ -118,6 +119,7 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: shlq %cl, %rsi
; BURR-NEXT: notb %cl
; BURR-NEXT: shrq %rdx
+; BURR-NEXT: # kill: def $cl killed $cl killed $ecx
; BURR-NEXT: shrq %cl, %rdx
; BURR-NEXT: orq %rsi, %rdx
; BURR-NEXT: movq %rdx, 16(%rax)
@@ -126,8 +128,8 @@ define i256 @test1(i256 %a) nounwind {
; SRC-LABEL: test1:
; SRC: # %bb.0:
; SRC-NEXT: movq %rdi, %rax
-; SRC-NEXT: addl %esi, %esi
-; SRC-NEXT: addb $3, %sil
+; SRC-NEXT: leal (%rsi,%rsi), %edx
+; SRC-NEXT: addb $3, %dl
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -136,11 +138,11 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movl %esi, %edx
-; SRC-NEXT: andb $7, %dl
-; SRC-NEXT: shrb $3, %sil
-; SRC-NEXT: negb %sil
-; SRC-NEXT: movsbq %sil, %rsi
+; SRC-NEXT: movl %edx, %ecx
+; SRC-NEXT: shrb $3, %cl
+; SRC-NEXT: andb $24, %cl
+; SRC-NEXT: negb %cl
+; SRC-NEXT: movsbq %cl, %rsi
; SRC-NEXT: movq -16(%rsp,%rsi), %rdi
; SRC-NEXT: movq %rdi, %r8
; SRC-NEXT: movl %edx, %ecx
@@ -171,6 +173,7 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: addb $3, %dl
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shrb $3, %cl
+; LIN-NEXT: andb $24, %cl
; LIN-NEXT: negb %cl
; LIN-NEXT: movsbq %cl, %rsi
; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -182,7 +185,6 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; LIN-NEXT: movq -32(%rsp,%rsi), %rdi
-; LIN-NEXT: andb $7, %dl
; LIN-NEXT: movq %rdi, %r8
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shlq %cl, %r8
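
The scheduler-backtracking checks above show the same decomposition on x86-64, where the load/store unit is 8 bytes: `shrb $3, %dl; andb $24, %dl` rounds the byte offset of an i256 shift down to a multiple of 8, and the residual count is handled by `shldq`/`shrdq`. A tiny self-contained check of that arithmetic, with an illustrative example value:

    #include <cassert>

    int main() {
      // 64-bit target: offsets into the 2x-wide i256 slot stay below 32
      // and land on {0, 8, 16, 24}, hence the ~7 mask (`andb $24`).
      unsigned ShAmt = 100;                    // example bit count, 0..255
      unsigned ByteOffset = (ShAmt / 8) & ~7u; // == 8
      unsigned BitShift = ShAmt % 64;          // == 36, done by shldq/shrdq
      assert(ByteOffset * 8 + BitShift == ShAmt);
      return 0;
    }
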
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2..ed1ba5c59e500 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -16,42 +16,36 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: subl $32, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, (%esp)
+; i686-NEXT: movl %edx, (%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
-; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
+; i686-NEXT: movl %ecx, %edx
+; i686-NEXT: shrb $3, %dl
+; i686-NEXT: andb $12, %dl
+; i686-NEXT: movzbl %dl, %ebx
+; i686-NEXT: movl 8(%esp,%ebx), %esi
+; i686-NEXT: movl (%esp,%ebx), %edx
+; i686-NEXT: movl 4(%esp,%ebx), %ebp
+; i686-NEXT: movl %ebp, %edi
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: movl 12(%esp,%ebx), %ebx
+; i686-NEXT: shrdl %cl, %ebx, %esi
+; i686-NEXT: shrdl %cl, %ebp, %edx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shrl %cl, %ebx
+; i686-NEXT: movl %ebx, 12(%eax)
+; i686-NEXT: movl %esi, 8(%eax)
; i686-NEXT: movl %edi, 4(%eax)
+; i686-NEXT: movl %edx, (%eax)
; i686-NEXT: addl $32, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
@@ -87,43 +81,37 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: subl $32, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, (%esp)
+; i686-NEXT: movl %edx, (%esp)
; i686-NEXT: sarl $31, %ebx
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: sarl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
-; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
+; i686-NEXT: movl %ecx, %edx
+; i686-NEXT: shrb $3, %dl
+; i686-NEXT: andb $12, %dl
+; i686-NEXT: movzbl %dl, %ebx
+; i686-NEXT: movl 8(%esp,%ebx), %esi
+; i686-NEXT: movl (%esp,%ebx), %edx
+; i686-NEXT: movl 4(%esp,%ebx), %ebp
+; i686-NEXT: movl %ebp, %edi
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: movl 12(%esp,%ebx), %ebx
+; i686-NEXT: shrdl %cl, %ebx, %esi
+; i686-NEXT: shrdl %cl, %ebp, %edx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: sarl %cl, %ebx
+; i686-NEXT: movl %ebx, 12(%eax)
+; i686-NEXT: movl %esi, 8(%eax)
; i686-NEXT: movl %edi, 4(%eax)
+; i686-NEXT: movl %edx, (%eax)
; i686-NEXT: addl $32, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
@@ -163,44 +151,35 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, (%esp)
-; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: negb %cl
-; i686-NEXT: movsbl %cl, %ebp
-; i686-NEXT: movl 24(%esp,%ebp), %ebx
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 20(%esp,%ebp), %edi
-; i686-NEXT: movl %edi, %esi
-; i686-NEXT: shrl %esi
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
-; i686-NEXT: movl 16(%esp,%ebp), %edx
-; i686-NEXT: movl 28(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shldl %cl, %ebx, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl %edx, %ebx
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %ebx
-; i686-NEXT: shldl %cl, %edx, %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: movl %ebx, (%eax)
-; i686-NEXT: movl %esi, 8(%eax)
+; i686-NEXT: movl %ecx, %edx
+; i686-NEXT: shrb $3, %dl
+; i686-NEXT: andb $12, %dl
+; i686-NEXT: negb %dl
+; i686-NEXT: movsbl %dl, %edi
+; i686-NEXT: movl 16(%esp,%edi), %edx
+; i686-NEXT: movl 20(%esp,%edi), %esi
+; i686-NEXT: movl 24(%esp,%edi), %ebx
+; i686-NEXT: movl %ebx, %ebp
+; i686-NEXT: shldl %cl, %esi, %ebp
+; i686-NEXT: movl 28(%esp,%edi), %edi
+; i686-NEXT: shldl %cl, %ebx, %edi
+; i686-NEXT: movl %edi, 12(%eax)
+; i686-NEXT: movl %ebp, 8(%eax)
+; i686-NEXT: movl %edx, %edi
+; i686-NEXT: shll %cl, %edi
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shldl %cl, %edx, %esi
+; i686-NEXT: movl %esi, 4(%eax)
+; i686-NEXT: movl %edi, (%eax)
; i686-NEXT: addl $32, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
@@ -267,13 +246,13 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT: subl $92, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
@@ -282,86 +261,70 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %esi, %ecx
-; i686-NEXT: andl $7, %ecx
+; i686-NEXT: movl %edx, %eax
+; i686-NEXT: andl $31, %eax
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 36(%esp,%edx), %edi
+; i686-NEXT: movl 28(%esp,%edx), %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl $3, %esi
-; i686-NEXT: andl $15, %esi
-; i686-NEXT: movl 40(%esp,%esi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 44(%esp,%esi), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 36(%esp,%esi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 32(%esp,%edx), %ebx
+; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %edi, %ebx
+; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 40(%esp,%edx), %edx
+; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT: shrdl %cl, %edx, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %ebx
-; i686-NEXT: andl $15, %ebx
-; i686-NEXT: movl 72(%esp,%ebx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
+; i686-NEXT: movl %esi, %edx
+; i686-NEXT: andl $31, %edx
+; i686-NEXT: shrl $3, %esi
+; i686-NEXT: andl $12, %esi
+; i686-NEXT: movl 68(%esp,%esi), %ebp
+; i686-NEXT: movl 64(%esp,%esi), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 76(%esp,%ebx), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: leal (%eax,%eax), %edi
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %ebp, %edi
-; i686-NEXT: movl 48(%esp,%esi), %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT: shrdl %cl, %ebp, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 60(%esp,%esi), %edi
+; i686-NEXT: movl 72(%esp,%esi), %esi
+; i686-NEXT: shrdl %cl, %esi, %ebp
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl 68(%esp,%ebx), %ecx
-; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT: movl 80(%esp,%ebx), %esi
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrl %cl, %ebp
+; i686-NEXT: shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT: shrdl %cl, %eax, %edi
; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %esi, 28(%ecx)
-; i686-NEXT: movl %ebx, 24(%ecx)
-; i686-NEXT: movl (%esp), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 16(%ecx)
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %edi, 20(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl %esi, 28(%eax)
+; i686-NEXT: movl %ebp, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %edi, 16(%eax)
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: addl $92, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -406,102 +369,85 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
; i686-NEXT: subl $92, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: sarl $31, %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: sarl $31, %eax
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, %ebx
-; i686-NEXT: andl $7, %ebx
-; i686-NEXT: shrl $3, %edi
-; i686-NEXT: andl $15, %edi
-; i686-NEXT: movl 32(%esp,%edi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 36(%esp,%edi), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebp, %eax
-; i686-NEXT: movl %ebp, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %eax
-; i686-NEXT: andl $15, %eax
-; i686-NEXT: movl 64(%esp,%eax), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 68(%esp,%eax), %esi
-; i686-NEXT: leal (%esi,%esi), %eax
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %eax
-; i686-NEXT: orl %ebp, %eax
-; i686-NEXT: movl 28(%esp,%edi), %ecx
+; i686-NEXT: movl %edx, %eax
+; i686-NEXT: andl $31, %eax
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 36(%esp,%edx), %edi
+; i686-NEXT: movl 28(%esp,%edx), %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 40(%esp,%edi), %edi
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT: movl 60(%esp,%ecx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 72(%esp,%ecx), %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: sarl %cl, %edi
+; i686-NEXT: movl 32(%esp,%edx), %ebx
+; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %edi, %ebx
+; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 40(%esp,%edx), %edx
+; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT: shrdl %cl, %edx, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %esi, %edx
+; i686-NEXT: andl $31, %edx
+; i686-NEXT: shrl $3, %esi
+; i686-NEXT: andl $12, %esi
+; i686-NEXT: movl 68(%esp,%esi), %ebp
+; i686-NEXT: movl 64(%esp,%esi), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT: shrdl %cl, %ebp, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 60(%esp,%esi), %edi
+; i686-NEXT: movl 72(%esp,%esi), %esi
+; i686-NEXT: shrdl %cl, %esi, %ebp
+; i686-NEXT: movl %eax, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: sarl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 28(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 24(%ecx)
-; i686-NEXT: movl %ebx, 16(%ecx)
-; i686-NEXT: movl %edi, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %eax, 20(%ecx)
+; i686-NEXT: shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: sarl %cl, (%esp) # 4-byte Folded Spill
+; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
+; i686-NEXT: shrdl %cl, %eax, %edi
+; i686-NEXT: sarl %cl, %esi
+; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl %esi, 28(%eax)
+; i686-NEXT: movl %ebp, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %edi, 16(%eax)
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
; i686-NEXT: addl $92, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
@@ -550,107 +496,97 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebp, %ecx
-; i686-NEXT: shrl $3, %ebp
-; i686-NEXT: andl $15, %ebp
-; i686-NEXT: leal {{[0-9]+}}(%esp), %eax
-; i686-NEXT: subl %ebp, %eax
+; i686-NEXT: movl %ecx, %ebx
+; i686-NEXT: shrl $3, %ebx
+; i686-NEXT: andl $12, %ebx
+; i686-NEXT: leal {{[0-9]+}}(%esp), %edx
+; i686-NEXT: subl %ebx, %edx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl 8(%eax), %edx
-; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT: andl $7, %ecx
-; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: movl 4(%eax), %esi
+; i686-NEXT: movl (%edx), %esi
; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %esi
-; i686-NEXT: notl %ecx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
+; i686-NEXT: movl 4(%edx), %esi
; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl (%eax), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: shrl $3, %edx
-; i686-NEXT: andl $15, %edx
-; i686-NEXT: leal {{[0-9]+}}(%esp), %esi
-; i686-NEXT: subl %edx, %esi
+; i686-NEXT: movl 8(%edx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: andl $31, %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shldl %cl, %esi, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %eax, %ebp
+; i686-NEXT: shrl $3, %ebp
+; i686-NEXT: andl $12, %ebp
+; i686-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; i686-NEXT: subl %ebp, %ecx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: andl $7, %ebx
-; i686-NEXT: movl 8(%esi), %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl 4(%esi), %eax
+; i686-NEXT: movl (%ecx), %edx
+; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 4(%ecx), %edi
+; i686-NEXT: movl 8(%ecx), %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: andl $31, %eax
+; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shldl %cl, %edi, %esi
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; i686-NEXT: movl %edx, %eax
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: shll %cl, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %eax
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: notl %ecx
+; i686-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: negl %ebx
+; i686-NEXT: movl 64(%esp,%ebx), %ebx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: orl %edi, %eax
-; i686-NEXT: movl (%esi), %ecx
-; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl %esi, %edi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT: shldl %cl, %eax, %ebx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT: movl %eax, %edx
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: shll %cl, %edx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shldl %cl, %eax, %edi
; i686-NEXT: negl %ebp
-; i686-NEXT: movl 64(%esp,%ebp), %esi
+; i686-NEXT: movl 96(%esp,%ebp), %ebp
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: movl (%esp), %edi # 4-byte Reload
-; i686-NEXT: shldl %cl, %edi, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl %esi, %edi
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shldl %cl, %esi, %ebp
-; i686-NEXT: negl %edx
-; i686-NEXT: movl 96(%esp,%edx), %edx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shldl %cl, %ebx, %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %edx, 28(%ecx)
-; i686-NEXT: movl %ebp, 20(%ecx)
-; i686-NEXT: movl %edi, 16(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 4(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %eax, 24(%ecx)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 8(%ecx)
+; i686-NEXT: shldl %cl, %eax, %ebp
+; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl %ebp, 28(%eax)
+; i686-NEXT: movl %esi, 24(%eax)
+; i686-NEXT: movl %edi, 20(%eax)
+; i686-NEXT: movl %edx, 16(%eax)
+; i686-NEXT: movl %ebx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
; i686-NEXT: addl $100, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
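
For reference, the left-shift lowering that the updated i686 checks above encode can be modeled directly: the value is stored into a 2x-wide stack buffer with zeros below it, the load pointer is moved down by the word-aligned part of the shift amount (the asm does this by negating the offset with negb/movsbl), and the residual bits are handled with shld-style funnel shifts. A minimal C++ sketch, assuming little-endian word order and a hypothetical helper name; this illustrates the strategy behind the checks, not code from the patch:

#include <cstdint>
#include <cstring>

// Hypothetical model of the expansion for a 128-bit shl on a 32-bit target.
// Precondition: Amt < 128.
static void shl128_via_stack(const uint32_t Src[4], unsigned Amt,
                             uint32_t Dst[4]) {
  // Zeros sit below the value, so reading from below shifts zeros in.
  uint32_t Buf[8] = {0, 0, 0, 0, Src[0], Src[1], Src[2], Src[3]};
  unsigned ByteOff = (Amt / 8) & ~3u; // word-aligned part, cf. "andb $12"
  unsigned BitRem = Amt & 31;         // residual bits, cf. "andl $31"
  uint32_t W[4];
  std::memcpy(W, reinterpret_cast<const char *>(Buf) + (16 - ByteOff),
              sizeof(W));
  Dst[0] = W[0] << BitRem; // "shll %cl" on the lowest result word
  for (int I = 3; I > 0; --I) // "shldl %cl, lo, hi" on each adjacent pair
    Dst[I] = BitRem ? (W[I] << BitRem) | (W[I - 1] >> (32 - BitRem)) : W[I];
}

Every load in the sketch lands on a 4-byte boundary, matching the word-aligned offsets in the checks above.
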
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index e1466aebf4225..bf159acc43f91 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -11,7 +11,7 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
+; CHECK-NEXT: subl $80, %esp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -39,67 +39,43 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: andb $7, %al
-; CHECK-NEXT: shrb $3, %cl
-; CHECK-NEXT: movzbl %cl, %ebp
-; CHECK-NEXT: movl 32(%esp,%ebp), %esi
-; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %esi
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: notb %dl
-; CHECK-NEXT: movl 36(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 40(%esp,%ebp), %esi
-; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %esi
-; CHECK-NEXT: movl 44(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 48(%esp,%ebp), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: movl 52(%esp,%ebp), %edi
-; CHECK-NEXT: leal (%edi,%edi), %esi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: orl %ebx, %esi
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT: movl 28(%esp,%ebp), %edx
-; CHECK-NEXT: movl 56(%esp,%ebp), %ebx
-; CHECK-NEXT: shrdl %cl, %ebx, %edi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %ebp, %edx
-; CHECK-NEXT: sarl %cl, %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl %ebx, 28(%eax)
-; CHECK-NEXT: movl %edi, 24(%eax)
-; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 16(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 8(%eax)
-; CHECK-NEXT: movl %edx, (%eax)
-; CHECK-NEXT: movl %esi, 20(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 12(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 4(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: shrb $5, %al
+; CHECK-NEXT: movzbl %al, %ebp
+; CHECK-NEXT: movl 24(%esp,%ebp,4), %eax
+; CHECK-NEXT: movl 20(%esp,%ebp,4), %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: shrdl %cl, %eax, %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 28(%esp,%ebp,4), %edx
+; CHECK-NEXT: shrdl %cl, %edx, %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 32(%esp,%ebp,4), %ebx
+; CHECK-NEXT: shrdl %cl, %ebx, %edx
+; CHECK-NEXT: movl %edx, (%esp) # 4-byte Spill
+; CHECK-NEXT: movl 36(%esp,%ebp,4), %edx
+; CHECK-NEXT: shrdl %cl, %edx, %ebx
+; CHECK-NEXT: movl 40(%esp,%ebp,4), %eax
+; CHECK-NEXT: shrdl %cl, %eax, %edx
+; CHECK-NEXT: movl 16(%esp,%ebp,4), %esi
+; CHECK-NEXT: movl 44(%esp,%ebp,4), %ebp
+; CHECK-NEXT: shrdl %cl, %ebp, %eax
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: shrdl %cl, %edi, %esi
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: sarl %cl, %ebp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl %ebp, 28(%ecx)
+; CHECK-NEXT: movl %eax, 24(%ecx)
+; CHECK-NEXT: movl %edx, 20(%ecx)
+; CHECK-NEXT: movl %ebx, 16(%ecx)
+; CHECK-NEXT: movl (%esp), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 12(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 8(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 4(%ecx)
+; CHECK-NEXT: movl %esi, (%ecx)
+; CHECK-NEXT: addl $80, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -120,42 +96,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movb %r8b, %dl
-; CHECK-X64-O0-NEXT: movb %dl, %cl
-; CHECK-X64-O0-NEXT: andb $7, %cl
+; CHECK-X64-O0-NEXT: movb %r8b, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-X64-O0-NEXT: shrb $3, %dl
+; CHECK-X64-O0-NEXT: movb %cl, %dl
+; CHECK-X64-O0-NEXT: shrb $6, %dl
; CHECK-X64-O0-NEXT: movzbl %dl, %edx
; CHECK-X64-O0-NEXT: movl %edx, %edi
-; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi), %rdx
-; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi), %r8
-; CHECK-X64-O0-NEXT: movq %r8, %r9
-; CHECK-X64-O0-NEXT: shrq %cl, %r9
+; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT: movq %r9, %rdx
+; CHECK-X64-O0-NEXT: shrdq %cl, %rsi, %rdx
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: notb %cl
-; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi), %rsi
-; CHECK-X64-O0-NEXT: movq %rsi, %r10
-; CHECK-X64-O0-NEXT: addq %r10, %r10
-; CHECK-X64-O0-NEXT: shlq %cl, %r10
-; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: orq %r10, %r9
-; CHECK-X64-O0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi), %rdi
+; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi,8), %rdi
; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: shrdq %cl, %r8, %rdx
+; CHECK-X64-O0-NEXT: shrdq %cl, %r9, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-X64-O0-NEXT: sarq %cl, %rdi
; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-X64-O0-NEXT: movq %rdi, 24(%rax)
; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax)
-; CHECK-X64-O0-NEXT: movq %rdx, (%rax)
-; CHECK-X64-O0-NEXT: movq %rcx, 8(%rax)
+; CHECK-X64-O0-NEXT: movq %rdx, 8(%rax)
+; CHECK-X64-O0-NEXT: movq %rcx, (%rax)
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift1:
; CHECK-X64-O2: # %bb.0: # %entry
-; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
@@ -165,29 +134,23 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movl %r8d, %eax
-; CHECK-X64-O2-NEXT: andb $7, %al
-; CHECK-X64-O2-NEXT: shrb $3, %r8b
-; CHECK-X64-O2-NEXT: movzbl %r8b, %edx
-; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx), %rsi
-; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT: movq %rdi, %r8
-; CHECK-X64-O2-NEXT: movl %eax, %ecx
-; CHECK-X64-O2-NEXT: shrq %cl, %r8
-; CHECK-X64-O2-NEXT: notb %cl
-; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx), %r10
-; CHECK-X64-O2-NEXT: leaq (%r10,%r10), %r11
-; CHECK-X64-O2-NEXT: shlq %cl, %r11
-; CHECK-X64-O2-NEXT: orq %r8, %r11
-; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rdx
-; CHECK-X64-O2-NEXT: movl %eax, %ecx
-; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %r10
-; CHECK-X64-O2-NEXT: shrdq %cl, %rdi, %rsi
+; CHECK-X64-O2-NEXT: movl %r8d, %ecx
+; CHECK-X64-O2-NEXT: shrb $6, %cl
+; CHECK-X64-O2-NEXT: movzbl %cl, %edx
+; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx,8), %rsi
+; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx,8), %rdi
+; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx,8), %r9
+; CHECK-X64-O2-NEXT: movq %r9, %r10
+; CHECK-X64-O2-NEXT: movl %r8d, %ecx
+; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r10
+; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx,8), %rdx
+; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rsi
+; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdi
; CHECK-X64-O2-NEXT: sarq %cl, %rdx
-; CHECK-X64-O2-NEXT: movq %rdx, 24(%r9)
-; CHECK-X64-O2-NEXT: movq %r10, 16(%r9)
-; CHECK-X64-O2-NEXT: movq %rsi, (%r9)
-; CHECK-X64-O2-NEXT: movq %r11, 8(%r9)
+; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT: movq %rsi, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
; CHECK-X64-O2-NEXT: retq
entry:
%0 = ashr i256 %x, %a
@@ -202,8 +165,8 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: subl $80, %esp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -220,68 +183,52 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movb %al, %ch
-; CHECK-NEXT: andb $7, %ch
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrb $3, %al
+; CHECK-NEXT: andb $28, %al
; CHECK-NEXT: negb %al
; CHECK-NEXT: movsbl %al, %eax
-; CHECK-NEXT: movl 68(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: notb %cl
-; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT: movl 64(%esp,%eax), %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebp
-; CHECK-NEXT: shrl %cl, %ebp
-; CHECK-NEXT: orl %edx, %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 76(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: movl 72(%esp,%eax), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: orl %edx, %ebx
-; CHECK-NEXT: movl 84(%esp,%eax), %esi
+; CHECK-NEXT: movl 52(%esp,%eax), %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl 80(%esp,%eax), %edi
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shrl %edx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %edx
-; CHECK-NEXT: orl %esi, %edx
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: movl 56(%esp,%eax), %edx
+; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: shldl %cl, %esi, %edi
-; CHECK-NEXT: movl 60(%esp,%eax), %ebp
-; CHECK-NEXT: movl 88(%esp,%eax), %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %eax, %esi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 60(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: shldl %cl, %edx, %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 64(%esp,%eax), %edx
+; CHECK-NEXT: movl %edx, %ebp
+; CHECK-NEXT: shldl %cl, %esi, %ebp
+; CHECK-NEXT: movl 68(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: shldl %cl, %edx, %ebx
+; CHECK-NEXT: movl 72(%esp,%eax), %edi
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: shldl %cl, %esi, %edx
+; CHECK-NEXT: movl 48(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, (%esp) # 4-byte Spill
+; CHECK-NEXT: movl 76(%esp,%eax), %esi
+; CHECK-NEXT: shldl %cl, %edi, %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %esi, 28(%eax)
-; CHECK-NEXT: movl %edi, 20(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: movl %esi, 12(%eax)
-; CHECK-NEXT: movl %ebp, %esi
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %ebp, %edi
-; CHECK-NEXT: movl %edi, 4(%eax)
-; CHECK-NEXT: movl %esi, (%eax)
; CHECK-NEXT: movl %edx, 24(%eax)
-; CHECK-NEXT: movl %ebx, 16(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 8(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: movl %ebx, 20(%eax)
+; CHECK-NEXT: movl %ebp, 16(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 12(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 8(%eax)
+; CHECK-NEXT: movl (%esp), %edi # 4-byte Reload
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: shll %cl, %edx
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: shldl %cl, %edi, %esi
+; CHECK-NEXT: movl %esi, 4(%eax)
+; CHECK-NEXT: movl %edx, (%eax)
+; CHECK-NEXT: addl $80, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -299,42 +246,37 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movb %sil, %dl
-; CHECK-X64-O0-NEXT: movb %dl, %cl
-; CHECK-X64-O0-NEXT: andb $7, %cl
+; CHECK-X64-O0-NEXT: movb %sil, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-X64-O0-NEXT: movb %cl, %dl
; CHECK-X64-O0-NEXT: shrb $3, %dl
+; CHECK-X64-O0-NEXT: andb $24, %dl
; CHECK-X64-O0-NEXT: negb %dl
-; CHECK-X64-O0-NEXT: movsbq %dl, %rdx
-; CHECK-X64-O0-NEXT: movq -16(%rsp,%rdx), %rsi
-; CHECK-X64-O0-NEXT: movq %rsi, %r10
-; CHECK-X64-O0-NEXT: shlq %cl, %r10
+; CHECK-X64-O0-NEXT: movsbq %dl, %r8
+; CHECK-X64-O0-NEXT: movq -32(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT: movq -24(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT: movq -16(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT: movq %r10, %rsi
+; CHECK-X64-O0-NEXT: shldq %cl, %rdx, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: notb %cl
-; CHECK-X64-O0-NEXT: movq -32(%rsp,%rdx), %r9
-; CHECK-X64-O0-NEXT: movq -24(%rsp,%rdx), %r8
-; CHECK-X64-O0-NEXT: movq %r8, %r11
-; CHECK-X64-O0-NEXT: shrq %r11
-; CHECK-X64-O0-NEXT: shrq %cl, %r11
+; CHECK-X64-O0-NEXT: movq -8(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT: shldq %cl, %r10, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: orq %r11, %r10
-; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT: movq -8(%rsp,%rdx), %rdx
-; CHECK-X64-O0-NEXT: shldq %cl, %rsi, %rdx
-; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: movq %r9, %rsi
-; CHECK-X64-O0-NEXT: shlq %cl, %rsi
+; CHECK-X64-O0-NEXT: movq %r9, %r10
+; CHECK-X64-O0-NEXT: shlq %cl, %r10
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: shldq %cl, %r9, %r8
+; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-X64-O0-NEXT: shldq %cl, %r9, %rdx
; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-X64-O0-NEXT: movq %r8, 8(%rdi)
-; CHECK-X64-O0-NEXT: movq %rsi, (%rdi)
-; CHECK-X64-O0-NEXT: movq %rdx, 24(%rdi)
-; CHECK-X64-O0-NEXT: movq %rcx, 16(%rdi)
+; CHECK-X64-O0-NEXT: movq %r8, 24(%rdi)
+; CHECK-X64-O0-NEXT: movq %rsi, 16(%rdi)
+; CHECK-X64-O0-NEXT: movq %rdx, 8(%rdi)
+; CHECK-X64-O0-NEXT: movq %rcx, (%rdi)
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift2:
; CHECK-X64-O2: # %bb.0:
+; CHECK-X64-O2-NEXT: movq %rsi, %rcx
; CHECK-X64-O2-NEXT: movq %rdi, %rax
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -344,32 +286,26 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movl %esi, %edx
-; CHECK-X64-O2-NEXT: andb $7, %dl
-; CHECK-X64-O2-NEXT: shrb $3, %sil
-; CHECK-X64-O2-NEXT: negb %sil
-; CHECK-X64-O2-NEXT: movsbq %sil, %rsi
-; CHECK-X64-O2-NEXT: movq -16(%rsp,%rsi), %rdi
-; CHECK-X64-O2-NEXT: movq %rdi, %r8
-; CHECK-X64-O2-NEXT: movl %edx, %ecx
+; CHECK-X64-O2-NEXT: movl %ecx, %edx
+; CHECK-X64-O2-NEXT: shrb $3, %dl
+; CHECK-X64-O2-NEXT: andb $24, %dl
+; CHECK-X64-O2-NEXT: negb %dl
+; CHECK-X64-O2-NEXT: movsbq %dl, %rdx
+; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT: movq -24(%rsp,%rdx), %rdi
+; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT: movq %r8, %r9
+; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %r9
+; CHECK-X64-O2-NEXT: movq -8(%rsp,%rdx), %rdx
+; CHECK-X64-O2-NEXT: shldq %cl, %r8, %rdx
+; CHECK-X64-O2-NEXT: movq %rsi, %r8
; CHECK-X64-O2-NEXT: shlq %cl, %r8
-; CHECK-X64-O2-NEXT: notb %cl
-; CHECK-X64-O2-NEXT: movq -32(%rsp,%rsi), %r9
-; CHECK-X64-O2-NEXT: movq -24(%rsp,%rsi), %r10
-; CHECK-X64-O2-NEXT: movq %r10, %r11
-; CHECK-X64-O2-NEXT: shrq %r11
-; CHECK-X64-O2-NEXT: shrq %cl, %r11
-; CHECK-X64-O2-NEXT: orq %r8, %r11
-; CHECK-X64-O2-NEXT: movq -8(%rsp,%rsi), %rsi
-; CHECK-X64-O2-NEXT: movl %edx, %ecx
-; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %rsi
-; CHECK-X64-O2-NEXT: movq %r9, %rdi
-; CHECK-X64-O2-NEXT: shlq %cl, %rdi
-; CHECK-X64-O2-NEXT: shldq %cl, %r9, %r10
-; CHECK-X64-O2-NEXT: movq %rsi, 24(%rax)
-; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
-; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
-; CHECK-X64-O2-NEXT: movq %r11, 16(%rax)
+; CHECK-X64-O2-NEXT: # kill: def $cl killed $cl killed $rcx
+; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %rdi
+; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT: movq %r9, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %rdi, 8(%rax)
+; CHECK-X64-O2-NEXT: movq %r8, (%rax)
; CHECK-X64-O2-NEXT: retq
{
%b = shl i256 1, %c ; %c must not be a constant
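
The "%c must not be a constant" note matters because a constant shift amount would be folded before legalization, so the shift-through-stack expansion would never fire. For the arithmetic-shift case exercised by shift1 above, the same buffer trick works with sign words instead of zeros in the upper half. A minimal C++ sketch under the same assumptions as before (hypothetical helper name, little-endian word order, not code from the patch):

#include <cstdint>
#include <cstring>

// Hypothetical model of the expansion for a 256-bit ashr on a 32-bit target.
// Precondition: Amt < 256.
static void ashr256_via_stack(const uint32_t Src[8], unsigned Amt,
                              uint32_t Dst[8]) {
  uint32_t Sign = (Src[7] & 0x80000000u) ? ~0u : 0u;
  uint32_t Buf[16];
  std::memcpy(Buf, Src, sizeof(uint32_t) * 8);
  for (int I = 8; I < 16; ++I)
    Buf[I] = Sign; // the shifted-in bits, cf. the repeated "sarl $31" stores
  unsigned ByteOff = (Amt / 8) & ~3u; // cf. "shrb $5" plus the ,4 index scale
  unsigned BitRem = Amt & 31;
  uint32_t W[9];
  std::memcpy(W, reinterpret_cast<const char *>(Buf) + ByteOff, sizeof(W));
  // "shrdl %cl" on each pair; the top pair degenerates to an arithmetic
  // shift of the top word, which is the final "sarl %cl" in the checks.
  for (int I = 0; I < 8; ++I)
    Dst[I] = BitRem ? (W[I] >> BitRem) | (W[I + 1] << (32 - BitRem)) : W[I];
}

As in the shl case, all nine loads are word-aligned regardless of the shift amount.
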
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc8797..5c9c81758d633 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,61 +588,60 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $32, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ah
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $32, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -658,44 +657,35 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
@@ -718,43 +708,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -771,43 +761,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
@@ -899,66 +884,64 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -967,58 +950,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -1033,28 +1005,28 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
@@ -1081,57 +1053,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
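
(Aside, not part of the patch: the updated shl_16bytes lowering above can be
read as the following C sketch. The `shrb $3; andb $12` pair rounds the shift
amount down to a 4-byte-aligned byte offset into the stack slot, so every load
comes from an aligned address, and the remaining `amt & 31` bits are applied
with shldl or shl+shr pairs. Names below are illustrative, not taken from the
patch.

  #include <stdint.h>
  #include <string.h>

  /* Sketch of an i128 shift-left through a 32-byte stack slot,
     assuming a 4-byte (32-bit) native load/store unit. */
  static void shl_16bytes_sketch(const uint32_t src[4], unsigned amt,
                                 uint32_t dst[4]) {
    uint32_t slot[8];
    memset(slot, 0, 16);              /* low half: zero fill      */
    memcpy(slot + 4, src, 16);        /* high half: the value     */
    unsigned off  = (amt >> 3) & 12;  /* aligned offset: 0/4/8/12 */
    unsigned bits = amt & 31;         /* sub-word bit remainder   */
    /* The read window starts 'off' bytes below the value; this is
       the negb/movsbl negative index in the generated code.      */
    const uint32_t *p = (const uint32_t *)((const char *)(slot + 4) - off);
    for (int i = 0; i < 4; ++i)
      dst[i] = (p[i] << bits) |
               (uint32_t)((uint64_t)p[i - 1] >> (32 - bits));
  }

The 64-bit shift by (32 - bits) sidesteps the undefined full-width shift when
bits == 0; the generated code gets the same effect from the `andb $31; xorb
$31` mask plus a pre-shift by one.)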
@@ -1218,62 +1180,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $32, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $32, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1289,45 +1250,36 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
@@ -1349,45 +1301,45 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -1404,44 +1356,39 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
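
(The same pattern recurs at the next width: in the 32-byte tests that follow,
the x86-64 variants use an 8-byte unit — `shrb $6` extracts the 64-bit word
index, applied through an 8-scaled addressing mode such as
`movq -64(%rsp,%r8,8)`, with the remainder mask becoming `andb $63; xorb $63`
— while the i386 variants use `shrb $5` with a 4-scaled index. As a worked
example, a shift amount of 100 gives word index 100 >> 6 = 1, i.e. byte offset
8, and bit remainder 100 & 63 = 36.)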
@@ -1459,36 +1406,36 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -1496,142 +1443,127 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -1640,17 +1572,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
@@ -1662,7 +1594,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1672,95 +1604,94 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp,4), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1773,7 +1704,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
@@ -1806,67 +1737,44 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1880,31 +1788,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1913,68 +1819,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -1988,32 +1893,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -2022,58 +1925,46 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2089,31 +1980,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: negb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %sil, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: negb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -2146,79 +2038,72 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $24, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax
@@ -2226,50 +2111,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $24, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2278,7 +2154,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $84, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
@@ -2288,7 +2164,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
@@ -2310,79 +2186,78 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: negb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -2398,7 +2273,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $84, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2411,7 +2286,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
@@ -2444,69 +2319,45 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $28, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2520,31 +2371,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -2553,66 +2402,72 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edx), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 84(%esp,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
@@ -2631,32 +2486,30 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -2665,61 +2518,48 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $28, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2735,36 +2575,36 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
@@ -2773,145 +2613,130 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -2920,17 +2745,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx
@@ -2942,7 +2767,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2953,95 +2778,94 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp,4), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -3054,7 +2878,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax
@@ -3088,67 +2912,44 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -3162,104 +2963,99 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -3273,93 +3069,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -3407,9 +3189,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
@@ -3417,7 +3199,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
@@ -3426,7 +3207,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -3488,22 +3269,19 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -3511,8 +3289,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -3520,64 +3296,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3606,7 +3360,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -3616,42 +3370,41 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
@@ -3662,10 +3415,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -3676,11 +3429,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
@@ -3699,8 +3449,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -3709,51 +3457,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3762,42 +3499,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3806,8 +3543,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3816,6 +3551,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -3834,196 +3571,195 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -4036,62 +3772,62 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -4108,137 +3844,92 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4251,42 +3942,44 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $184, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4307,8 +4000,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -4326,146 +4019,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 56(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $184, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -4478,7 +4163,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4489,7 +4174,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
@@ -4499,7 +4184,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
@@ -4514,7 +4199,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -4524,7 +4209,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -4534,8 +4219,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -4552,120 +4235,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4680,7 +4333,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
@@ -4712,100 +4364,91 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r14), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r14), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r14), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
@@ -4815,7 +4458,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
@@ -4831,69 +4474,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r10), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r10), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r10), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -4922,7 +4537,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -4932,57 +4547,54 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
@@ -4994,11 +4606,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
@@ -5017,8 +4625,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -5027,56 +4633,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -5086,41 +4676,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5137,13 +4727,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebp
@@ -5163,174 +4755,165 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, (%esp) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 56(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 52(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5366,67 +4949,67 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %edx, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -5443,136 +5026,87 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%edx), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 60(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -5585,45 +5119,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $216, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $192, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5641,13 +5175,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -5664,156 +5198,139 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 212(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 36(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $216, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $192, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -5826,41 +5343,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $180, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ebx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5882,13 +5399,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -5905,126 +5421,88 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 164(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $180, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -6072,9 +5550,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
@@ -6082,7 +5560,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
@@ -6091,7 +5568,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -6153,22 +5630,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -6176,74 +5650,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6281,44 +5731,43 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
@@ -6329,10 +5778,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -6343,11 +5792,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
@@ -6376,52 +5822,39 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6430,12 +5863,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
@@ -6443,7 +5876,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
@@ -6452,19 +5885,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6473,7 +5906,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6482,7 +5915,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -6503,196 +5936,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -6705,7 +6137,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6718,7 +6150,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
@@ -6726,189 +6158,144 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -6921,7 +6308,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $188, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6942,201 +6329,201 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 60(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $188, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -7149,7 +6536,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -7158,7 +6545,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
@@ -7170,173 +6557,142 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
>From 306ebacce1d4bf026c47a9adf65981f9a1e15434 Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Tue, 25 Jun 2024 15:02:18 +0200
Subject: [PATCH 3/4] Addressing review comments
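In particular (summarizing the hunks below): pass the alloca address
space, the natural store alignment of the load/store type, and both
MOLoad and MOStore flags to allowsMisalignedMemoryAccesses, and reuse
the precomputed KnownTrailingZeros in the one-step check.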
---
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index cd40df473c67c..015d3b412715b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4542,8 +4542,10 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
unsigned IsFast = 0;
const bool AllowsFastMisalignedMemoryAccesses =
TLI.allowsMisalignedMemoryAccesses(
- LoadStoreVT, /*AddrSpace*/ 0, /*Alignment*/ Align(1),
- /*Flags*/ MachineMemOperand::MONone, &IsFast) &&
+ LoadStoreVT, /*AddrSpace=*/DAG.getDataLayout().getAllocaAddrSpace(),
+ /*Alignment=*/Align(LoadStoreVT.getStoreSize()),
+ /*Flags=*/MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
+ &IsFast) &&
IsFast;
if (AllowsFastMisalignedMemoryAccesses && KnownTrailingZeros >= 3)
return Align(1);
@@ -4552,9 +4554,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
}();
const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
- const bool IsOneStepShift =
- DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
- Log2_32(ShiftUnitInBits);
+ const bool IsOneStepShift = KnownTrailingZeros >= Log2_32(ShiftUnitInBits);
// If we can't do it as one step, we'll have two uses of shift amount,
// and thus must freeze it.
>From 5794d2c4a61d932b38cf8453c3cf33fafdd8e37f Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Wed, 3 Jul 2024 08:24:41 +0200
Subject: [PATCH 4/4] Use native register width as shifting unit
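Derive the shifting unit from the store size of the legalized
load/store type rather than from a computed alignment, and let
CreateStackTemporary pick the slot's natural alignment. Worked example
(illustrative, not quoted from the tests): expanding an i256 shift on a
32-bit target legalizes LoadStoreVT to i32, so ShiftUnitInBits is 32
and Log2_32(32) == 5; a shift amount with at least five known-zero low
bits is a one-step shift and is performed entirely by offsetting the
aligned load from the stack slot.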
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 40 +-
...lar-shift-by-byte-multiple-legalization.ll | 120 +-
.../AArch64/wide-scalar-shift-legalization.ll | 118 +-
llvm/test/CodeGen/Mips/llvm-ir/ashr.ll | 64 +-
llvm/test/CodeGen/Mips/llvm-ir/lshr.ll | 180 +-
llvm/test/CodeGen/Mips/llvm-ir/shl.ll | 122 +-
llvm/test/CodeGen/PowerPC/ctrloop-sh.ll | 54 +-
llvm/test/CodeGen/PowerPC/pr59074.ll | 11 +-
...lar-shift-by-byte-multiple-legalization.ll | 734 +-
.../PowerPC/wide-scalar-shift-legalization.ll | 272 +-
llvm/test/CodeGen/RISCV/shifts.ll | 52 +-
...lar-shift-by-byte-multiple-legalization.ll | 408 +-
.../RISCV/wide-scalar-shift-legalization.ll | 396 +-
.../X86/div-rem-pair-recomposition-signed.ll | 502 +-
.../div-rem-pair-recomposition-unsigned.ll | 447 +-
llvm/test/CodeGen/X86/pr38539.ll | 102 +-
.../CodeGen/X86/scheduler-backtracking.ll | 170 +-
llvm/test/CodeGen/X86/shift-i128.ll | 565 +-
llvm/test/CodeGen/X86/shift-i256.ll | 278 +-
...lar-shift-by-byte-multiple-legalization.ll | 21282 ++++++++++++++--
.../X86/wide-scalar-shift-legalization.ll | 4567 ++--
...ad-of-small-alloca-with-zero-upper-half.ll | 3575 ++-
.../CodeGen/X86/widen-load-of-small-alloca.ll | 1675 +-
23 files changed, 27988 insertions(+), 7746 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 015d3b412715b..a4dfb89828ebe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4535,26 +4535,12 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
} while (!TLI.isTypeLegal(LoadStoreVT));
- const unsigned KnownTrailingZeros =
- DAG.computeKnownBits(ShAmt).countMinTrailingZeros();
-
- const Align LoadStoreAlign = [&]() -> Align {
- unsigned IsFast = 0;
- const bool AllowsFastMisalignedMemoryAccesses =
- TLI.allowsMisalignedMemoryAccesses(
- LoadStoreVT, /*AddrSpace=*/DAG.getDataLayout().getAllocaAddrSpace(),
- /*Alignment=*/Align(LoadStoreVT.getStoreSize()),
- /*Flags=*/MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
- &IsFast) &&
- IsFast;
- if (AllowsFastMisalignedMemoryAccesses && KnownTrailingZeros >= 3)
- return Align(1);
-
- return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
- }();
-
- const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
- const bool IsOneStepShift = KnownTrailingZeros >= Log2_32(ShiftUnitInBits);
+ const unsigned ShiftUnitInBits = LoadStoreVT.getStoreSize() * 8;
+ assert(isPowerOf2_32(ShiftUnitInBits) &&
+ "Shifting unit is not a a power of two!");
+ const bool IsOneStepShift =
+ DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
+ Log2_32(ShiftUnitInBits);
// If we can't do it as one step, we'll have two uses of shift amount,
// and thus must freeze it.
@@ -4572,9 +4558,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
// Get a temporary stack slot 2x the width of our VT.
// FIXME: reuse stack slots?
- Align StackSlotAlignment(LoadStoreAlign);
- SDValue StackPtr = DAG.CreateStackTemporary(
- TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
+ SDValue StackPtr = DAG.CreateStackTemporary(StackSlotVT);
EVT PtrTy = StackPtr.getValueType();
SDValue Ch = DAG.getEntryNode();
@@ -4594,7 +4578,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
Init = DAG.getNode(ISD::BUILD_PAIR, dl, StackSlotVT, AllZeros, Shiftee);
}
// And spill it into the stack slot.
- Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
+ Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo);
// Now, compute the full-byte offset into stack slot from where we can load.
// We have shift amount, which is in bits. Offset should point to an aligned
@@ -4640,11 +4624,9 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
// And load it! While the load is not legal, legalizing it is obvious.
- SDValue Res =
- DAG.getLoad(VT, dl, Ch, AdjStackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
- LoadStoreAlign);
- // We've performed the shift by a CHAR_BIT * [ShAmt / LoadAlign]
+ SDValue Res = DAG.getLoad(
+ VT, dl, Ch, AdjStackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
// If we may still have a remaining bits to shift by, do so now.
if (!IsOneStepShift) {
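As a standalone illustration of the one-step check above (a minimal
sketch mirroring the patch; the free function and its name are
hypothetical, not part of the change):

#include "llvm/Support/MathExtras.h"
#include <cassert>

// Returns true when every possible value of the shift amount is a
// multiple of the shifting unit, i.e. the whole shift can be performed
// just by offsetting the aligned load from the stack slot.
static bool isOneStepShift(unsigned KnownTrailingZeros,
                           unsigned ShiftUnitInBits) {
  assert(llvm::isPowerOf2_32(ShiftUnitInBits) &&
         "Shifting unit is not a power of two!");
  return KnownTrailingZeros >= llvm::Log2_32(ShiftUnitInBits);
}

With a 32-bit unit this requires five known-zero low bits, which
matches the aligned-offset/residual split in the updated checks (e.g.
the shrl $3 + andl $60 byte offset and andl $31 residual shift on X86,
or the #0x18/#0x38 masks on AArch64).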
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index e21015ad3db30..4f46f7731e257 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -179,21 +179,36 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: lshr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
-; ALL-NEXT: ldp x9, x8, [x0, #16]
-; ALL-NEXT: movi v0.2d, #0000000000000000
+; ALL-NEXT: ldr q0, [x0]
+; ALL-NEXT: ldp x8, x9, [x0, #16]
; ALL-NEXT: ldr x10, [x1]
-; ALL-NEXT: ldr q1, [x0]
-; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: movi v1.2d, #0000000000000000
+; ALL-NEXT: str q0, [sp, #-64]!
+; ALL-NEXT: stp x8, x9, [sp, #16]
; ALL-NEXT: mov x8, sp
-; ALL-NEXT: and x9, x10, #0x1f
-; ALL-NEXT: str q1, [sp]
+; ALL-NEXT: and x9, x10, #0x18
+; ALL-NEXT: stp q1, q1, [sp, #32]
; ALL-NEXT: add x8, x8, x9
-; ALL-NEXT: stp q0, q0, [sp, #32]
-; ALL-NEXT: ldp x10, x9, [x8, #16]
-; ALL-NEXT: ldr q0, [x8]
-; ALL-NEXT: str q0, [x2]
-; ALL-NEXT: stp x10, x9, [x2, #16]
+; ALL-NEXT: lsl x9, x10, #3
+; ALL-NEXT: ldp x11, x10, [x8, #16]
+; ALL-NEXT: ldp x8, x12, [x8]
+; ALL-NEXT: mvn w13, w9
+; ALL-NEXT: and x9, x9, #0x38
+; ALL-NEXT: lsl x14, x10, #1
+; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsr x11, x11, x9
+; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: lsr x10, x10, x9
+; ALL-NEXT: lsr x12, x12, x9
+; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x8, x8, x9
+; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsl x13, x15, x13
+; ALL-NEXT: orr x11, x14, x11
+; ALL-NEXT: orr x8, x9, x8
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x11, x10, [x2, #16]
+; ALL-NEXT: stp x8, x9, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -207,21 +222,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: shl_32bytes:
; ALL: // %bb.0:
; ALL-NEXT: sub sp, sp, #64
-; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: movi v0.2d, #0000000000000000
+; ALL-NEXT: ldp x8, x9, [x0, #16]
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
-; ALL-NEXT: stp x9, x8, [sp, #48]
-; ALL-NEXT: mov x8, sp
-; ALL-NEXT: and x9, x10, #0x1f
-; ALL-NEXT: add x8, x8, #32
-; ALL-NEXT: stp q0, q0, [sp]
-; ALL-NEXT: str q1, [sp, #32]
-; ALL-NEXT: sub x8, x8, x9
-; ALL-NEXT: ldp x9, x10, [x8, #16]
-; ALL-NEXT: ldr q0, [x8]
-; ALL-NEXT: str q0, [x2]
-; ALL-NEXT: stp x9, x10, [x2, #16]
+; ALL-NEXT: mov x11, sp
+; ALL-NEXT: add x11, x11, #32
+; ALL-NEXT: and x12, x10, #0x18
+; ALL-NEXT: stp x8, x9, [sp, #48]
+; ALL-NEXT: lsl x9, x10, #3
+; ALL-NEXT: stp q0, q1, [sp, #16]
+; ALL-NEXT: sub x8, x11, x12
+; ALL-NEXT: str q0, [sp]
+; ALL-NEXT: mvn w13, w9
+; ALL-NEXT: and x9, x9, #0x38
+; ALL-NEXT: ldp x10, x11, [x8]
+; ALL-NEXT: ldp x12, x8, [x8, #16]
+; ALL-NEXT: lsr x14, x10, #1
+; ALL-NEXT: lsr x15, x11, #1
+; ALL-NEXT: lsl x11, x11, x9
+; ALL-NEXT: lsr x16, x12, #1
+; ALL-NEXT: lsl x10, x10, x9
+; ALL-NEXT: lsl x12, x12, x9
+; ALL-NEXT: lsr x14, x14, x13
+; ALL-NEXT: lsl x8, x8, x9
+; ALL-NEXT: lsr x9, x16, x13
+; ALL-NEXT: lsr x13, x15, x13
+; ALL-NEXT: orr x11, x11, x14
+; ALL-NEXT: orr x8, x8, x9
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x10, x11, [x2]
+; ALL-NEXT: stp x9, x8, [x2, #16]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -234,22 +265,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: ashr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldr q0, [x0]
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: ldr x10, [x1]
-; ALL-NEXT: ldr q0, [x0]
-; ALL-NEXT: and x10, x10, #0x1f
+; ALL-NEXT: str q0, [sp, #-64]!
+; ALL-NEXT: asr x11, x8, #63
; ALL-NEXT: stp x9, x8, [sp, #16]
-; ALL-NEXT: asr x8, x8, #63
-; ALL-NEXT: mov x9, sp
-; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: stp x8, x8, [sp, #48]
-; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: add x8, x9, x10
-; ALL-NEXT: ldp x10, x9, [x8, #16]
-; ALL-NEXT: ldr q0, [x8]
-; ALL-NEXT: str q0, [x2]
-; ALL-NEXT: stp x10, x9, [x2, #16]
+; ALL-NEXT: mov x8, sp
+; ALL-NEXT: and x9, x10, #0x18
+; ALL-NEXT: stp x11, x11, [sp, #48]
+; ALL-NEXT: add x8, x8, x9
+; ALL-NEXT: lsl x9, x10, #3
+; ALL-NEXT: stp x11, x11, [sp, #32]
+; ALL-NEXT: ldp x11, x10, [x8, #16]
+; ALL-NEXT: mvn w13, w9
+; ALL-NEXT: ldp x8, x12, [x8]
+; ALL-NEXT: and x9, x9, #0x38
+; ALL-NEXT: lsl x14, x10, #1
+; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsr x11, x11, x9
+; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: asr x10, x10, x9
+; ALL-NEXT: lsr x12, x12, x9
+; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x8, x8, x9
+; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsl x13, x15, x13
+; ALL-NEXT: orr x11, x14, x11
+; ALL-NEXT: orr x8, x9, x8
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x11, x10, [x2, #16]
+; ALL-NEXT: stp x8, x9, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index 531e0fa740da7..faf3602791bbe 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -154,39 +154,38 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: lshr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
-; ALL-NEXT: ldp x9, x8, [x0, #16]
-; ALL-NEXT: movi v0.2d, #0000000000000000
-; ALL-NEXT: ldr x10, [x1]
-; ALL-NEXT: ldr q1, [x0]
-; ALL-NEXT: stp x9, x8, [sp, #16]
-; ALL-NEXT: lsr x8, x10, #3
-; ALL-NEXT: mov x9, sp
-; ALL-NEXT: str q1, [sp]
-; ALL-NEXT: and x12, x10, #0x3f
-; ALL-NEXT: and x8, x8, #0x18
-; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: ldr q0, [x0]
+; ALL-NEXT: ldp x8, x10, [x0, #16]
+; ALL-NEXT: ldr x9, [x1]
+; ALL-NEXT: movi v1.2d, #0000000000000000
+; ALL-NEXT: str q0, [sp, #-64]!
+; ALL-NEXT: lsr x11, x9, #3
+; ALL-NEXT: stp x8, x10, [sp, #16]
+; ALL-NEXT: mov x8, sp
+; ALL-NEXT: stp q1, q1, [sp, #32]
+; ALL-NEXT: and x12, x9, #0x3f
+; ALL-NEXT: and x10, x11, #0x18
; ALL-NEXT: eor x12, x12, #0x3f
-; ALL-NEXT: add x8, x9, x8
+; ALL-NEXT: add x8, x8, x10
; ALL-NEXT: ldp x13, x11, [x8]
-; ALL-NEXT: ldr x9, [x8, #24]
+; ALL-NEXT: ldr x10, [x8, #24]
; ALL-NEXT: ldr x8, [x8, #16]
-; ALL-NEXT: lsl x14, x9, #1
-; ALL-NEXT: lsr x9, x9, x10
+; ALL-NEXT: lsl x14, x10, #1
+; ALL-NEXT: lsr x10, x10, x9
; ALL-NEXT: lsl x15, x11, #1
-; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsr x13, x13, x10
+; ALL-NEXT: lsr x11, x11, x9
+; ALL-NEXT: lsr x13, x13, x9
; ALL-NEXT: lsl x14, x14, x12
; ALL-NEXT: lsl x12, x15, x12
; ALL-NEXT: lsl x15, x8, #1
-; ALL-NEXT: lsr x8, x8, x10
-; ALL-NEXT: mvn w10, w10
-; ALL-NEXT: lsl x10, x15, x10
+; ALL-NEXT: lsr x8, x8, x9
+; ALL-NEXT: mvn w9, w9
+; ALL-NEXT: lsl x9, x15, x9
; ALL-NEXT: orr x8, x14, x8
-; ALL-NEXT: stp x8, x9, [x2, #16]
-; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: orr x8, x11, x10
-; ALL-NEXT: stp x9, x8, [x2]
+; ALL-NEXT: stp x8, x10, [x2, #16]
+; ALL-NEXT: orr x10, x12, x13
+; ALL-NEXT: orr x8, x11, x9
+; ALL-NEXT: stp x10, x8, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -199,39 +198,39 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: shl_32bytes:
; ALL: // %bb.0:
; ALL-NEXT: sub sp, sp, #64
-; ALL-NEXT: ldp x9, x8, [x0, #16]
-; ALL-NEXT: movi v0.2d, #0000000000000000
-; ALL-NEXT: ldr x10, [x1]
-; ALL-NEXT: ldr q1, [x0]
-; ALL-NEXT: stp x9, x8, [sp, #48]
-; ALL-NEXT: lsr x8, x10, #3
-; ALL-NEXT: mov x9, sp
-; ALL-NEXT: add x9, x9, #32
-; ALL-NEXT: stp q0, q1, [sp, #16]
-; ALL-NEXT: and x12, x10, #0x3f
-; ALL-NEXT: and x8, x8, #0x18
-; ALL-NEXT: str q0, [sp]
+; ALL-NEXT: ldp x10, x8, [x0, #16]
+; ALL-NEXT: movi v1.2d, #0000000000000000
+; ALL-NEXT: ldr x9, [x1]
+; ALL-NEXT: ldr q0, [x0]
+; ALL-NEXT: lsr x11, x9, #3
+; ALL-NEXT: stp x10, x8, [sp, #48]
+; ALL-NEXT: mov x8, sp
+; ALL-NEXT: add x8, x8, #32
+; ALL-NEXT: stp q1, q0, [sp, #16]
+; ALL-NEXT: and x12, x9, #0x3f
+; ALL-NEXT: and x10, x11, #0x18
+; ALL-NEXT: str q1, [sp]
; ALL-NEXT: eor x12, x12, #0x3f
-; ALL-NEXT: sub x8, x9, x8
+; ALL-NEXT: sub x8, x8, x10
; ALL-NEXT: ldp x11, x13, [x8, #16]
-; ALL-NEXT: ldr x9, [x8]
+; ALL-NEXT: ldr x10, [x8]
; ALL-NEXT: ldr x8, [x8, #8]
-; ALL-NEXT: lsr x15, x9, #1
-; ALL-NEXT: lsl x9, x9, x10
+; ALL-NEXT: lsr x15, x10, #1
+; ALL-NEXT: lsl x10, x10, x9
; ALL-NEXT: lsr x14, x11, #1
-; ALL-NEXT: lsl x11, x11, x10
-; ALL-NEXT: lsl x13, x13, x10
+; ALL-NEXT: lsl x11, x11, x9
+; ALL-NEXT: lsl x13, x13, x9
; ALL-NEXT: lsr x14, x14, x12
; ALL-NEXT: lsr x12, x15, x12
; ALL-NEXT: lsr x15, x8, #1
-; ALL-NEXT: lsl x8, x8, x10
-; ALL-NEXT: mvn w10, w10
-; ALL-NEXT: lsr x10, x15, x10
+; ALL-NEXT: lsl x8, x8, x9
+; ALL-NEXT: mvn w9, w9
+; ALL-NEXT: lsr x9, x15, x9
; ALL-NEXT: orr x8, x8, x12
-; ALL-NEXT: stp x9, x8, [x2]
-; ALL-NEXT: orr x9, x13, x14
-; ALL-NEXT: orr x8, x11, x10
-; ALL-NEXT: stp x8, x9, [x2, #16]
+; ALL-NEXT: stp x10, x8, [x2]
+; ALL-NEXT: orr x10, x13, x14
+; ALL-NEXT: orr x8, x11, x9
+; ALL-NEXT: stp x8, x10, [x2, #16]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -243,21 +242,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: ashr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldr q0, [x0]
; ALL-NEXT: ldp x9, x8, [x0, #16]
-; ALL-NEXT: mov x11, sp
; ALL-NEXT: ldr x10, [x1]
-; ALL-NEXT: ldr q0, [x0]
+; ALL-NEXT: str q0, [sp, #-64]!
+; ALL-NEXT: asr x11, x8, #63
; ALL-NEXT: stp x9, x8, [sp, #16]
-; ALL-NEXT: lsr x9, x10, #3
-; ALL-NEXT: asr x8, x8, #63
-; ALL-NEXT: str q0, [sp]
+; ALL-NEXT: lsr x8, x10, #3
+; ALL-NEXT: mov x9, sp
; ALL-NEXT: and x12, x10, #0x3f
-; ALL-NEXT: and x9, x9, #0x18
-; ALL-NEXT: stp x8, x8, [sp, #48]
+; ALL-NEXT: and x8, x8, #0x18
+; ALL-NEXT: stp x11, x11, [sp, #48]
; ALL-NEXT: eor x12, x12, #0x3f
-; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: add x8, x11, x9
+; ALL-NEXT: stp x11, x11, [sp, #32]
+; ALL-NEXT: add x8, x9, x8
; ALL-NEXT: ldp x13, x11, [x8]
; ALL-NEXT: ldr x9, [x8, #24]
; ALL-NEXT: ldr x8, [x8, #16]
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 6db3fb930b94e..1a29b57986325 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -384,15 +384,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS-NEXT: .cfi_def_cfa_offset 32
; MIPS-NEXT: sra $1, $4, 31
; MIPS-NEXT: sw $7, 28($sp)
-; MIPS-NEXT: sw $6, 24($sp)
; MIPS-NEXT: sw $5, 20($sp)
-; MIPS-NEXT: sw $4, 16($sp)
; MIPS-NEXT: sw $1, 12($sp)
-; MIPS-NEXT: sw $1, 8($sp)
; MIPS-NEXT: sw $1, 4($sp)
+; MIPS-NEXT: addiu $2, $sp, 0
+; MIPS-NEXT: sw $6, 24($sp)
+; MIPS-NEXT: sw $4, 16($sp)
+; MIPS-NEXT: sw $1, 8($sp)
; MIPS-NEXT: sw $1, 0($sp)
-; MIPS-NEXT: addiu $1, $sp, 0
-; MIPS-NEXT: addiu $1, $1, 16
+; MIPS-NEXT: addiu $1, $2, 16
; MIPS-NEXT: lw $2, 60($sp)
; MIPS-NEXT: srl $3, $2, 3
; MIPS-NEXT: andi $3, $3, 12
@@ -425,15 +425,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS32-NEXT: .cfi_def_cfa_offset 32
; MIPS32-NEXT: sra $1, $4, 31
; MIPS32-NEXT: sw $7, 28($sp)
-; MIPS32-NEXT: sw $6, 24($sp)
; MIPS32-NEXT: sw $5, 20($sp)
-; MIPS32-NEXT: sw $4, 16($sp)
; MIPS32-NEXT: sw $1, 12($sp)
-; MIPS32-NEXT: sw $1, 8($sp)
; MIPS32-NEXT: sw $1, 4($sp)
+; MIPS32-NEXT: addiu $2, $sp, 0
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
+; MIPS32-NEXT: sw $1, 8($sp)
; MIPS32-NEXT: sw $1, 0($sp)
-; MIPS32-NEXT: addiu $1, $sp, 0
-; MIPS32-NEXT: addiu $1, $1, 16
+; MIPS32-NEXT: addiu $1, $2, 16
; MIPS32-NEXT: lw $2, 60($sp)
; MIPS32-NEXT: srl $3, $2, 3
; MIPS32-NEXT: andi $3, $3, 12
@@ -466,15 +466,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R2-NEXT: .cfi_def_cfa_offset 32
; 32R2-NEXT: sra $1, $4, 31
; 32R2-NEXT: sw $7, 28($sp)
-; 32R2-NEXT: sw $6, 24($sp)
; 32R2-NEXT: sw $5, 20($sp)
-; 32R2-NEXT: sw $4, 16($sp)
; 32R2-NEXT: sw $1, 12($sp)
-; 32R2-NEXT: sw $1, 8($sp)
; 32R2-NEXT: sw $1, 4($sp)
+; 32R2-NEXT: addiu $2, $sp, 0
+; 32R2-NEXT: sw $6, 24($sp)
+; 32R2-NEXT: sw $4, 16($sp)
+; 32R2-NEXT: sw $1, 8($sp)
; 32R2-NEXT: sw $1, 0($sp)
-; 32R2-NEXT: addiu $1, $sp, 0
-; 32R2-NEXT: addiu $1, $1, 16
+; 32R2-NEXT: addiu $1, $2, 16
; 32R2-NEXT: lw $2, 60($sp)
; 32R2-NEXT: srl $3, $2, 3
; 32R2-NEXT: andi $3, $3, 12
@@ -507,15 +507,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R6-NEXT: .cfi_def_cfa_offset 32
; 32R6-NEXT: sra $1, $4, 31
; 32R6-NEXT: sw $7, 28($sp)
-; 32R6-NEXT: sw $6, 24($sp)
; 32R6-NEXT: sw $5, 20($sp)
-; 32R6-NEXT: sw $4, 16($sp)
; 32R6-NEXT: sw $1, 12($sp)
-; 32R6-NEXT: sw $1, 8($sp)
; 32R6-NEXT: sw $1, 4($sp)
+; 32R6-NEXT: addiu $2, $sp, 0
+; 32R6-NEXT: sw $6, 24($sp)
+; 32R6-NEXT: sw $4, 16($sp)
+; 32R6-NEXT: sw $1, 8($sp)
; 32R6-NEXT: sw $1, 0($sp)
-; 32R6-NEXT: addiu $1, $sp, 0
-; 32R6-NEXT: addiu $1, $1, 16
+; 32R6-NEXT: addiu $1, $2, 16
; 32R6-NEXT: lw $2, 60($sp)
; 32R6-NEXT: srl $3, $2, 3
; 32R6-NEXT: andi $3, $3, 12
@@ -619,13 +619,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
; MMR3-NEXT: sra $1, $4, 31
-; MMR3-NEXT: swp $6, 24($sp)
-; MMR3-NEXT: swp $4, 16($sp)
+; MMR3-NEXT: sw $7, 28($sp)
+; MMR3-NEXT: sw $5, 20($sp)
; MMR3-NEXT: sw $1, 12($sp)
-; MMR3-NEXT: sw $1, 8($sp)
; MMR3-NEXT: sw $1, 4($sp)
-; MMR3-NEXT: sw $1, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
+; MMR3-NEXT: sw $6, 24($sp)
+; MMR3-NEXT: sw $4, 16($sp)
+; MMR3-NEXT: sw $1, 8($sp)
+; MMR3-NEXT: sw $1, 0($sp)
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
; MMR3-NEXT: srl16 $4, $3, 3
@@ -660,15 +662,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 16, -4
; MMR6-NEXT: sra $1, $4, 31
-; MMR6-NEXT: sw $7, 32($sp)
-; MMR6-NEXT: sw $6, 28($sp)
-; MMR6-NEXT: sw $5, 24($sp)
-; MMR6-NEXT: sw $4, 20($sp)
-; MMR6-NEXT: sw $1, 16($sp)
+; MMR6-NEXT: sw $7, 28($sp)
+; MMR6-NEXT: sw $5, 20($sp)
; MMR6-NEXT: sw $1, 12($sp)
-; MMR6-NEXT: sw $1, 8($sp)
; MMR6-NEXT: sw $1, 4($sp)
-; MMR6-NEXT: addiu $2, $sp, 4
+; MMR6-NEXT: addiu $2, $sp, 0
+; MMR6-NEXT: sw $6, 24($sp)
+; MMR6-NEXT: sw $4, 16($sp)
+; MMR6-NEXT: sw $1, 8($sp)
+; MMR6-NEXT: sw $1, 0($sp)
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
; MMR6-NEXT: srl16 $4, $3, 3
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index fa10293c0f6fb..7db14fc506e79 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -398,39 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS2: # %bb.0: # %entry
; MIPS2-NEXT: addiu $sp, $sp, -32
; MIPS2-NEXT: .cfi_def_cfa_offset 32
-; MIPS2-NEXT: addiu $1, $sp, 0
+; MIPS2-NEXT: lw $1, 60($sp)
+; MIPS2-NEXT: addiu $2, $sp, 0
; MIPS2-NEXT: sw $7, 28($sp)
-; MIPS2-NEXT: sw $6, 24($sp)
; MIPS2-NEXT: sw $5, 20($sp)
-; MIPS2-NEXT: sw $4, 16($sp)
-; MIPS2-NEXT: addiu $1, $1, 16
-; MIPS2-NEXT: lw $2, 60($sp)
-; MIPS2-NEXT: srl $3, $2, 3
+; MIPS2-NEXT: addiu $2, $2, 16
+; MIPS2-NEXT: srl $3, $1, 3
; MIPS2-NEXT: andi $3, $3, 12
-; MIPS2-NEXT: subu $1, $1, $3
+; MIPS2-NEXT: sw $6, 24($sp)
+; MIPS2-NEXT: sw $4, 16($sp)
+; MIPS2-NEXT: subu $2, $2, $3
; MIPS2-NEXT: sw $zero, 12($sp)
-; MIPS2-NEXT: sw $zero, 8($sp)
; MIPS2-NEXT: sw $zero, 4($sp)
+; MIPS2-NEXT: lw $3, 4($2)
+; MIPS2-NEXT: sw $zero, 8($sp)
; MIPS2-NEXT: sw $zero, 0($sp)
-; MIPS2-NEXT: lw $3, 4($1)
-; MIPS2-NEXT: lw $5, 8($1)
-; MIPS2-NEXT: srlv $4, $5, $2
+; MIPS2-NEXT: lw $5, 8($2)
+; MIPS2-NEXT: srlv $4, $5, $1
; MIPS2-NEXT: sll $6, $3, 1
-; MIPS2-NEXT: andi $7, $2, 31
+; MIPS2-NEXT: andi $7, $1, 31
; MIPS2-NEXT: xori $7, $7, 31
; MIPS2-NEXT: sllv $6, $6, $7
-; MIPS2-NEXT: srlv $3, $3, $2
-; MIPS2-NEXT: lw $8, 0($1)
+; MIPS2-NEXT: srlv $3, $3, $1
+; MIPS2-NEXT: lw $8, 0($2)
; MIPS2-NEXT: sll $9, $8, 1
; MIPS2-NEXT: sllv $9, $9, $7
; MIPS2-NEXT: or $3, $3, $9
; MIPS2-NEXT: or $4, $4, $6
-; MIPS2-NEXT: lw $1, 12($1)
-; MIPS2-NEXT: srlv $1, $1, $2
+; MIPS2-NEXT: lw $2, 12($2)
+; MIPS2-NEXT: srlv $2, $2, $1
; MIPS2-NEXT: sll $5, $5, 1
; MIPS2-NEXT: sllv $5, $5, $7
-; MIPS2-NEXT: or $5, $1, $5
-; MIPS2-NEXT: srlv $2, $8, $2
+; MIPS2-NEXT: or $5, $2, $5
+; MIPS2-NEXT: srlv $2, $8, $1
; MIPS2-NEXT: jr $ra
; MIPS2-NEXT: addiu $sp, $sp, 32
;
@@ -438,39 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: addiu $1, $sp, 0
+; MIPS32-NEXT: lw $1, 60($sp)
+; MIPS32-NEXT: addiu $2, $sp, 0
; MIPS32-NEXT: sw $7, 28($sp)
-; MIPS32-NEXT: sw $6, 24($sp)
; MIPS32-NEXT: sw $5, 20($sp)
-; MIPS32-NEXT: sw $4, 16($sp)
-; MIPS32-NEXT: addiu $1, $1, 16
-; MIPS32-NEXT: lw $2, 60($sp)
-; MIPS32-NEXT: srl $3, $2, 3
+; MIPS32-NEXT: addiu $2, $2, 16
+; MIPS32-NEXT: srl $3, $1, 3
; MIPS32-NEXT: andi $3, $3, 12
-; MIPS32-NEXT: subu $1, $1, $3
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
+; MIPS32-NEXT: subu $2, $2, $3
; MIPS32-NEXT: sw $zero, 12($sp)
-; MIPS32-NEXT: sw $zero, 8($sp)
; MIPS32-NEXT: sw $zero, 4($sp)
+; MIPS32-NEXT: lw $3, 4($2)
+; MIPS32-NEXT: sw $zero, 8($sp)
; MIPS32-NEXT: sw $zero, 0($sp)
-; MIPS32-NEXT: lw $3, 4($1)
-; MIPS32-NEXT: lw $5, 8($1)
-; MIPS32-NEXT: srlv $4, $5, $2
+; MIPS32-NEXT: lw $5, 8($2)
+; MIPS32-NEXT: srlv $4, $5, $1
; MIPS32-NEXT: sll $6, $3, 1
-; MIPS32-NEXT: andi $7, $2, 31
+; MIPS32-NEXT: andi $7, $1, 31
; MIPS32-NEXT: xori $7, $7, 31
; MIPS32-NEXT: sllv $6, $6, $7
-; MIPS32-NEXT: srlv $3, $3, $2
-; MIPS32-NEXT: lw $8, 0($1)
+; MIPS32-NEXT: srlv $3, $3, $1
+; MIPS32-NEXT: lw $8, 0($2)
; MIPS32-NEXT: sll $9, $8, 1
; MIPS32-NEXT: sllv $9, $9, $7
; MIPS32-NEXT: or $3, $3, $9
; MIPS32-NEXT: or $4, $4, $6
-; MIPS32-NEXT: lw $1, 12($1)
-; MIPS32-NEXT: srlv $1, $1, $2
+; MIPS32-NEXT: lw $2, 12($2)
+; MIPS32-NEXT: srlv $2, $2, $1
; MIPS32-NEXT: sll $5, $5, 1
; MIPS32-NEXT: sllv $5, $5, $7
-; MIPS32-NEXT: or $5, $1, $5
-; MIPS32-NEXT: srlv $2, $8, $2
+; MIPS32-NEXT: or $5, $2, $5
+; MIPS32-NEXT: srlv $2, $8, $1
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: addiu $sp, $sp, 32
;
@@ -478,39 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2: # %bb.0: # %entry
; MIPS32R2-NEXT: addiu $sp, $sp, -32
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT: addiu $1, $sp, 0
+; MIPS32R2-NEXT: lw $1, 60($sp)
+; MIPS32R2-NEXT: addiu $2, $sp, 0
; MIPS32R2-NEXT: sw $7, 28($sp)
-; MIPS32R2-NEXT: sw $6, 24($sp)
; MIPS32R2-NEXT: sw $5, 20($sp)
-; MIPS32R2-NEXT: sw $4, 16($sp)
-; MIPS32R2-NEXT: addiu $1, $1, 16
-; MIPS32R2-NEXT: lw $2, 60($sp)
-; MIPS32R2-NEXT: srl $3, $2, 3
+; MIPS32R2-NEXT: addiu $2, $2, 16
+; MIPS32R2-NEXT: srl $3, $1, 3
; MIPS32R2-NEXT: andi $3, $3, 12
-; MIPS32R2-NEXT: subu $1, $1, $3
+; MIPS32R2-NEXT: sw $6, 24($sp)
+; MIPS32R2-NEXT: sw $4, 16($sp)
+; MIPS32R2-NEXT: subu $2, $2, $3
; MIPS32R2-NEXT: sw $zero, 12($sp)
-; MIPS32R2-NEXT: sw $zero, 8($sp)
; MIPS32R2-NEXT: sw $zero, 4($sp)
+; MIPS32R2-NEXT: lw $3, 4($2)
+; MIPS32R2-NEXT: sw $zero, 8($sp)
; MIPS32R2-NEXT: sw $zero, 0($sp)
-; MIPS32R2-NEXT: lw $3, 4($1)
-; MIPS32R2-NEXT: lw $5, 8($1)
-; MIPS32R2-NEXT: srlv $4, $5, $2
+; MIPS32R2-NEXT: lw $5, 8($2)
+; MIPS32R2-NEXT: srlv $4, $5, $1
; MIPS32R2-NEXT: sll $6, $3, 1
-; MIPS32R2-NEXT: andi $7, $2, 31
+; MIPS32R2-NEXT: andi $7, $1, 31
; MIPS32R2-NEXT: xori $7, $7, 31
; MIPS32R2-NEXT: sllv $6, $6, $7
-; MIPS32R2-NEXT: srlv $3, $3, $2
-; MIPS32R2-NEXT: lw $8, 0($1)
+; MIPS32R2-NEXT: srlv $3, $3, $1
+; MIPS32R2-NEXT: lw $8, 0($2)
; MIPS32R2-NEXT: sll $9, $8, 1
; MIPS32R2-NEXT: sllv $9, $9, $7
; MIPS32R2-NEXT: or $3, $3, $9
; MIPS32R2-NEXT: or $4, $4, $6
-; MIPS32R2-NEXT: lw $1, 12($1)
-; MIPS32R2-NEXT: srlv $1, $1, $2
+; MIPS32R2-NEXT: lw $2, 12($2)
+; MIPS32R2-NEXT: srlv $2, $2, $1
; MIPS32R2-NEXT: sll $5, $5, 1
; MIPS32R2-NEXT: sllv $5, $5, $7
-; MIPS32R2-NEXT: or $5, $1, $5
-; MIPS32R2-NEXT: srlv $2, $8, $2
+; MIPS32R2-NEXT: or $5, $2, $5
+; MIPS32R2-NEXT: srlv $2, $8, $1
; MIPS32R2-NEXT: jr $ra
; MIPS32R2-NEXT: addiu $sp, $sp, 32
;
@@ -518,39 +518,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6: # %bb.0: # %entry
; MIPS32R6-NEXT: addiu $sp, $sp, -32
; MIPS32R6-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R6-NEXT: addiu $1, $sp, 0
+; MIPS32R6-NEXT: lw $1, 60($sp)
+; MIPS32R6-NEXT: addiu $2, $sp, 0
; MIPS32R6-NEXT: sw $7, 28($sp)
-; MIPS32R6-NEXT: sw $6, 24($sp)
; MIPS32R6-NEXT: sw $5, 20($sp)
-; MIPS32R6-NEXT: sw $4, 16($sp)
-; MIPS32R6-NEXT: addiu $1, $1, 16
-; MIPS32R6-NEXT: lw $2, 60($sp)
-; MIPS32R6-NEXT: srl $3, $2, 3
+; MIPS32R6-NEXT: addiu $2, $2, 16
+; MIPS32R6-NEXT: srl $3, $1, 3
; MIPS32R6-NEXT: andi $3, $3, 12
-; MIPS32R6-NEXT: subu $1, $1, $3
+; MIPS32R6-NEXT: sw $6, 24($sp)
+; MIPS32R6-NEXT: sw $4, 16($sp)
+; MIPS32R6-NEXT: subu $2, $2, $3
; MIPS32R6-NEXT: sw $zero, 12($sp)
-; MIPS32R6-NEXT: sw $zero, 8($sp)
; MIPS32R6-NEXT: sw $zero, 4($sp)
+; MIPS32R6-NEXT: lw $3, 4($2)
+; MIPS32R6-NEXT: sw $zero, 8($sp)
; MIPS32R6-NEXT: sw $zero, 0($sp)
-; MIPS32R6-NEXT: lw $3, 4($1)
-; MIPS32R6-NEXT: lw $5, 8($1)
-; MIPS32R6-NEXT: srlv $4, $5, $2
+; MIPS32R6-NEXT: lw $5, 8($2)
+; MIPS32R6-NEXT: srlv $4, $5, $1
; MIPS32R6-NEXT: sll $6, $3, 1
-; MIPS32R6-NEXT: andi $7, $2, 31
+; MIPS32R6-NEXT: andi $7, $1, 31
; MIPS32R6-NEXT: xori $7, $7, 31
; MIPS32R6-NEXT: sllv $6, $6, $7
-; MIPS32R6-NEXT: srlv $3, $3, $2
-; MIPS32R6-NEXT: lw $8, 0($1)
+; MIPS32R6-NEXT: srlv $3, $3, $1
+; MIPS32R6-NEXT: lw $8, 0($2)
; MIPS32R6-NEXT: sll $9, $8, 1
; MIPS32R6-NEXT: sllv $9, $9, $7
; MIPS32R6-NEXT: or $3, $3, $9
; MIPS32R6-NEXT: or $4, $4, $6
-; MIPS32R6-NEXT: lw $1, 12($1)
-; MIPS32R6-NEXT: srlv $1, $1, $2
+; MIPS32R6-NEXT: lw $2, 12($2)
+; MIPS32R6-NEXT: srlv $2, $2, $1
; MIPS32R6-NEXT: sll $5, $5, 1
; MIPS32R6-NEXT: sllv $5, $5, $7
-; MIPS32R6-NEXT: or $5, $1, $5
-; MIPS32R6-NEXT: srlv $2, $8, $2
+; MIPS32R6-NEXT: or $5, $2, $5
+; MIPS32R6-NEXT: srlv $2, $8, $1
; MIPS32R6-NEXT: jr $ra
; MIPS32R6-NEXT: addiu $sp, $sp, 32
;
@@ -639,14 +639,16 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: swp $6, 24($sp)
-; MMR3-NEXT: swp $4, 16($sp)
-; MMR3-NEXT: sw $2, 12($sp)
-; MMR3-NEXT: sw $2, 8($sp)
-; MMR3-NEXT: sw $2, 4($sp)
-; MMR3-NEXT: sw $2, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
+; MMR3-NEXT: li16 $3, 0
+; MMR3-NEXT: sw $7, 28($sp)
+; MMR3-NEXT: sw $5, 20($sp)
+; MMR3-NEXT: sw $3, 12($sp)
+; MMR3-NEXT: sw $3, 4($sp)
+; MMR3-NEXT: sw $6, 24($sp)
+; MMR3-NEXT: sw $4, 16($sp)
+; MMR3-NEXT: sw $3, 8($sp)
+; MMR3-NEXT: sw $3, 0($sp)
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
; MMR3-NEXT: srl16 $4, $3, 3
@@ -680,16 +682,16 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: .cfi_def_cfa_offset 40
; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 16, -4
-; MMR6-NEXT: li16 $2, 0
-; MMR6-NEXT: sw $7, 32($sp)
-; MMR6-NEXT: sw $6, 28($sp)
-; MMR6-NEXT: sw $5, 24($sp)
-; MMR6-NEXT: sw $4, 20($sp)
-; MMR6-NEXT: sw $2, 16($sp)
-; MMR6-NEXT: sw $2, 12($sp)
-; MMR6-NEXT: sw $2, 8($sp)
-; MMR6-NEXT: sw $2, 4($sp)
-; MMR6-NEXT: addiu $2, $sp, 4
+; MMR6-NEXT: addiu $2, $sp, 0
+; MMR6-NEXT: li16 $3, 0
+; MMR6-NEXT: sw $7, 28($sp)
+; MMR6-NEXT: sw $5, 20($sp)
+; MMR6-NEXT: sw $3, 12($sp)
+; MMR6-NEXT: sw $3, 4($sp)
+; MMR6-NEXT: sw $6, 24($sp)
+; MMR6-NEXT: sw $4, 16($sp)
+; MMR6-NEXT: sw $3, 8($sp)
+; MMR6-NEXT: sw $3, 0($sp)
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
; MMR6-NEXT: srl16 $4, $3, 3
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 394890a9dcc7c..5320f6d8a4353 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -442,18 +442,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS2-NEXT: .cfi_def_cfa_offset 32
; MIPS2-NEXT: lw $1, 60($sp)
; MIPS2-NEXT: srl $2, $1, 3
-; MIPS2-NEXT: sw $7, 12($sp)
; MIPS2-NEXT: sw $6, 8($sp)
-; MIPS2-NEXT: sw $5, 4($sp)
; MIPS2-NEXT: sw $4, 0($sp)
; MIPS2-NEXT: andi $2, $2, 12
; MIPS2-NEXT: addiu $3, $sp, 0
+; MIPS2-NEXT: sw $7, 12($sp)
+; MIPS2-NEXT: sw $5, 4($sp)
; MIPS2-NEXT: addu $4, $3, $2
-; MIPS2-NEXT: sw $zero, 28($sp)
; MIPS2-NEXT: sw $zero, 24($sp)
-; MIPS2-NEXT: sw $zero, 20($sp)
; MIPS2-NEXT: sw $zero, 16($sp)
; MIPS2-NEXT: lw $5, 8($4)
+; MIPS2-NEXT: sw $zero, 28($sp)
+; MIPS2-NEXT: sw $zero, 20($sp)
; MIPS2-NEXT: lw $2, 4($4)
; MIPS2-NEXT: sllv $3, $2, $1
; MIPS2-NEXT: srl $6, $5, 1
@@ -481,18 +481,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32-NEXT: .cfi_def_cfa_offset 32
; MIPS32-NEXT: lw $1, 60($sp)
; MIPS32-NEXT: srl $2, $1, 3
-; MIPS32-NEXT: sw $7, 12($sp)
; MIPS32-NEXT: sw $6, 8($sp)
-; MIPS32-NEXT: sw $5, 4($sp)
; MIPS32-NEXT: sw $4, 0($sp)
; MIPS32-NEXT: andi $2, $2, 12
; MIPS32-NEXT: addiu $3, $sp, 0
+; MIPS32-NEXT: sw $7, 12($sp)
+; MIPS32-NEXT: sw $5, 4($sp)
; MIPS32-NEXT: addu $4, $3, $2
-; MIPS32-NEXT: sw $zero, 28($sp)
; MIPS32-NEXT: sw $zero, 24($sp)
-; MIPS32-NEXT: sw $zero, 20($sp)
; MIPS32-NEXT: sw $zero, 16($sp)
; MIPS32-NEXT: lw $5, 8($4)
+; MIPS32-NEXT: sw $zero, 28($sp)
+; MIPS32-NEXT: sw $zero, 20($sp)
; MIPS32-NEXT: lw $2, 4($4)
; MIPS32-NEXT: sllv $3, $2, $1
; MIPS32-NEXT: srl $6, $5, 1
@@ -520,18 +520,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
; MIPS32R2-NEXT: lw $1, 60($sp)
; MIPS32R2-NEXT: srl $2, $1, 3
-; MIPS32R2-NEXT: sw $7, 12($sp)
; MIPS32R2-NEXT: sw $6, 8($sp)
-; MIPS32R2-NEXT: sw $5, 4($sp)
; MIPS32R2-NEXT: sw $4, 0($sp)
; MIPS32R2-NEXT: andi $2, $2, 12
; MIPS32R2-NEXT: addiu $3, $sp, 0
+; MIPS32R2-NEXT: sw $7, 12($sp)
+; MIPS32R2-NEXT: sw $5, 4($sp)
; MIPS32R2-NEXT: addu $4, $3, $2
-; MIPS32R2-NEXT: sw $zero, 28($sp)
; MIPS32R2-NEXT: sw $zero, 24($sp)
-; MIPS32R2-NEXT: sw $zero, 20($sp)
; MIPS32R2-NEXT: sw $zero, 16($sp)
; MIPS32R2-NEXT: lw $5, 8($4)
+; MIPS32R2-NEXT: sw $zero, 28($sp)
+; MIPS32R2-NEXT: sw $zero, 20($sp)
; MIPS32R2-NEXT: lw $2, 4($4)
; MIPS32R2-NEXT: sllv $3, $2, $1
; MIPS32R2-NEXT: srl $6, $5, 1
@@ -559,18 +559,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: .cfi_def_cfa_offset 32
; MIPS32R6-NEXT: lw $1, 60($sp)
; MIPS32R6-NEXT: srl $2, $1, 3
-; MIPS32R6-NEXT: sw $7, 12($sp)
; MIPS32R6-NEXT: sw $6, 8($sp)
-; MIPS32R6-NEXT: sw $5, 4($sp)
; MIPS32R6-NEXT: sw $4, 0($sp)
; MIPS32R6-NEXT: andi $2, $2, 12
; MIPS32R6-NEXT: addiu $3, $sp, 0
+; MIPS32R6-NEXT: sw $7, 12($sp)
+; MIPS32R6-NEXT: sw $5, 4($sp)
; MIPS32R6-NEXT: addu $4, $3, $2
-; MIPS32R6-NEXT: sw $zero, 28($sp)
; MIPS32R6-NEXT: sw $zero, 24($sp)
-; MIPS32R6-NEXT: sw $zero, 20($sp)
; MIPS32R6-NEXT: sw $zero, 16($sp)
; MIPS32R6-NEXT: lw $5, 8($4)
+; MIPS32R6-NEXT: sw $zero, 28($sp)
+; MIPS32R6-NEXT: sw $zero, 20($sp)
; MIPS32R6-NEXT: lw $2, 4($4)
; MIPS32R6-NEXT: sllv $3, $2, $1
; MIPS32R6-NEXT: srl $6, $5, 1
@@ -680,82 +680,86 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR3: # %bb.0: # %entry
; MMR3-NEXT: addiusp -40
; MMR3-NEXT: .cfi_def_cfa_offset 40
-; MMR3-NEXT: swp $16, 32($sp)
-; MMR3-NEXT: .cfi_offset 17, -4
-; MMR3-NEXT: .cfi_offset 16, -8
+; MMR3-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
+; MMR3-NEXT: .cfi_offset 16, -4
; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: sw $2, 28($sp)
+; MMR3-NEXT: sw $2, 16($sp)
+; MMR3-NEXT: lw $3, 68($sp)
; MMR3-NEXT: sw $2, 24($sp)
+; MMR3-NEXT: sw $6, 8($sp)
+; MMR3-NEXT: sw $4, 0($sp)
+; MMR3-NEXT: sw $2, 28($sp)
; MMR3-NEXT: sw $2, 20($sp)
-; MMR3-NEXT: sw $2, 16($sp)
-; MMR3-NEXT: swp $6, 8($sp)
-; MMR3-NEXT: swp $4, 0($sp)
-; MMR3-NEXT: lw $2, 68($sp)
-; MMR3-NEXT: srl16 $3, $2, 3
-; MMR3-NEXT: andi $3, $3, 12
+; MMR3-NEXT: sw $7, 12($sp)
+; MMR3-NEXT: sw $5, 4($sp)
+; MMR3-NEXT: srl16 $2, $3, 3
+; MMR3-NEXT: andi $2, $2, 12
; MMR3-NEXT: addiur1sp $4, 0
-; MMR3-NEXT: addu16 $4, $4, $3
+; MMR3-NEXT: addu16 $4, $4, $2
; MMR3-NEXT: lw16 $6, 8($4)
-; MMR3-NEXT: lw16 $7, 4($4)
-; MMR3-NEXT: andi16 $5, $2, 31
-; MMR3-NEXT: sllv $16, $7, $5
-; MMR3-NEXT: srl16 $2, $6, 1
+; MMR3-NEXT: lw16 $2, 4($4)
+; MMR3-NEXT: andi16 $5, $3, 31
+; MMR3-NEXT: sllv $7, $2, $5
+; MMR3-NEXT: srl16 $3, $6, 1
; MMR3-NEXT: xori $1, $5, 31
-; MMR3-NEXT: srlv $3, $2, $1
-; MMR3-NEXT: lw16 $2, 0($4)
-; MMR3-NEXT: sllv $17, $2, $5
-; MMR3-NEXT: srl16 $2, $7, 1
+; MMR3-NEXT: srlv $3, $3, $1
+; MMR3-NEXT: lw16 $16, 0($4)
+; MMR3-NEXT: sllv $16, $16, $5
+; MMR3-NEXT: srl16 $2, $2, 1
; MMR3-NEXT: srlv $2, $2, $1
-; MMR3-NEXT: or16 $2, $17
-; MMR3-NEXT: or16 $3, $16
+; MMR3-NEXT: or16 $2, $16
+; MMR3-NEXT: or16 $3, $7
; MMR3-NEXT: sllv $6, $6, $5
; MMR3-NEXT: lw16 $7, 12($4)
; MMR3-NEXT: srl16 $4, $7, 1
; MMR3-NEXT: srlv $4, $4, $1
; MMR3-NEXT: or16 $4, $6
; MMR3-NEXT: sllv $5, $7, $5
-; MMR3-NEXT: lwp $16, 32($sp)
+; MMR3-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
;
; MMR6-LABEL: shl_i128:
; MMR6: # %bb.0: # %entry
-; MMR6-NEXT: addiu $sp, $sp, -32
-; MMR6-NEXT: .cfi_def_cfa_offset 32
+; MMR6-NEXT: addiu $sp, $sp, -40
+; MMR6-NEXT: .cfi_def_cfa_offset 40
+; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT: .cfi_offset 16, -4
; MMR6-NEXT: li16 $2, 0
-; MMR6-NEXT: sw $2, 28($sp)
+; MMR6-NEXT: sw $2, 16($sp)
+; MMR6-NEXT: lw $3, 68($sp)
; MMR6-NEXT: sw $2, 24($sp)
+; MMR6-NEXT: sw $6, 8($sp)
+; MMR6-NEXT: sw $4, 0($sp)
+; MMR6-NEXT: sw $2, 28($sp)
; MMR6-NEXT: sw $2, 20($sp)
-; MMR6-NEXT: sw $2, 16($sp)
; MMR6-NEXT: sw $7, 12($sp)
-; MMR6-NEXT: sw $6, 8($sp)
; MMR6-NEXT: sw $5, 4($sp)
-; MMR6-NEXT: sw $4, 0($sp)
-; MMR6-NEXT: lw $2, 60($sp)
-; MMR6-NEXT: srl16 $3, $2, 3
-; MMR6-NEXT: andi $3, $3, 12
+; MMR6-NEXT: srl16 $2, $3, 3
+; MMR6-NEXT: andi $2, $2, 12
; MMR6-NEXT: addiu $4, $sp, 0
-; MMR6-NEXT: addu16 $4, $4, $3
+; MMR6-NEXT: addu16 $4, $4, $2
; MMR6-NEXT: lw16 $5, 8($4)
-; MMR6-NEXT: lw16 $3, 4($4)
-; MMR6-NEXT: andi16 $6, $2, 31
-; MMR6-NEXT: sllv $1, $3, $6
-; MMR6-NEXT: srl16 $2, $5, 1
+; MMR6-NEXT: lw16 $2, 4($4)
+; MMR6-NEXT: andi16 $6, $3, 31
+; MMR6-NEXT: sllv $1, $2, $6
+; MMR6-NEXT: srl16 $3, $5, 1
; MMR6-NEXT: xori $7, $6, 31
-; MMR6-NEXT: srlv $8, $2, $7
-; MMR6-NEXT: lw16 $2, 0($4)
-; MMR6-NEXT: sllv $2, $2, $6
-; MMR6-NEXT: srl16 $3, $3, 1
; MMR6-NEXT: srlv $3, $3, $7
-; MMR6-NEXT: or $2, $2, $3
-; MMR6-NEXT: or $3, $1, $8
+; MMR6-NEXT: lw16 $16, 0($4)
+; MMR6-NEXT: sllv $8, $16, $6
+; MMR6-NEXT: srl16 $2, $2, 1
+; MMR6-NEXT: srlv $2, $2, $7
+; MMR6-NEXT: or $2, $8, $2
+; MMR6-NEXT: or $3, $1, $3
; MMR6-NEXT: sllv $1, $5, $6
; MMR6-NEXT: lw16 $5, 12($4)
; MMR6-NEXT: srl16 $4, $5, 1
; MMR6-NEXT: srlv $4, $4, $7
; MMR6-NEXT: or $4, $1, $4
; MMR6-NEXT: sllv $5, $5, $6
-; MMR6-NEXT: addiu $sp, $sp, 32
+; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
+; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
entry:
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index ae25feeb8893c..82ddef55eba30 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -13,22 +13,22 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
; CHECK-NEXT: li 6, 0
; CHECK-NEXT: mtctr 7
-; CHECK-NEXT: addi 7, 1, 20
+; CHECK-NEXT: addi 7, 1, 16
; CHECK-NEXT: .LBB0_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 8, 0(4)
-; CHECK-NEXT: lwz 9, 4(4)
-; CHECK-NEXT: lwz 10, 8(4)
-; CHECK-NEXT: lwz 11, 12(4)
+; CHECK-NEXT: lwz 8, 4(4)
+; CHECK-NEXT: lwz 9, 12(4)
+; CHECK-NEXT: lwz 10, 0(4)
+; CHECK-NEXT: lwz 11, 8(4)
; CHECK-NEXT: lwz 12, 12(5)
-; CHECK-NEXT: stw 6, 48(1)
-; CHECK-NEXT: stw 6, 44(1)
; CHECK-NEXT: stw 6, 40(1)
+; CHECK-NEXT: stw 6, 32(1)
+; CHECK-NEXT: stw 6, 44(1)
; CHECK-NEXT: stw 6, 36(1)
-; CHECK-NEXT: stw 11, 32(1)
-; CHECK-NEXT: stw 10, 28(1)
+; CHECK-NEXT: stw 11, 24(1)
+; CHECK-NEXT: stw 10, 16(1)
; CHECK-NEXT: clrlwi 10, 12, 27
-; CHECK-NEXT: stw 9, 24(1)
+; CHECK-NEXT: stw 9, 28(1)
; CHECK-NEXT: stw 8, 20(1)
; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
; CHECK-NEXT: lwzux 9, 8, 7
@@ -84,22 +84,22 @@ define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-NEXT: addi 6, 1, 24
; CHECK-NEXT: .LBB1_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 7, 0(4)
-; CHECK-NEXT: lwz 8, 4(4)
+; CHECK-NEXT: lwz 7, 8(4)
+; CHECK-NEXT: lwz 8, 0(4)
; CHECK-NEXT: lwz 11, 12(5)
-; CHECK-NEXT: lwz 9, 8(4)
+; CHECK-NEXT: lwz 9, 4(4)
; CHECK-NEXT: lwz 10, 12(4)
-; CHECK-NEXT: stw 8, 28(1)
+; CHECK-NEXT: stw 7, 32(1)
+; CHECK-NEXT: srawi 7, 8, 31
+; CHECK-NEXT: stw 8, 24(1)
; CHECK-NEXT: rlwinm 8, 11, 29, 28, 29
-; CHECK-NEXT: stw 7, 24(1)
-; CHECK-NEXT: srawi 7, 7, 31
; CHECK-NEXT: stw 10, 36(1)
; CHECK-NEXT: clrlwi 10, 11, 27
-; CHECK-NEXT: stw 9, 32(1)
+; CHECK-NEXT: stw 9, 28(1)
; CHECK-NEXT: subfic 12, 10, 32
; CHECK-NEXT: stw 7, 20(1)
-; CHECK-NEXT: stw 7, 16(1)
; CHECK-NEXT: stw 7, 12(1)
+; CHECK-NEXT: stw 7, 16(1)
; CHECK-NEXT: stw 7, 8(1)
; CHECK-NEXT: sub 7, 6, 8
; CHECK-NEXT: lwz 8, 4(7)
@@ -152,24 +152,24 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
; CHECK-NEXT: li 6, 0
; CHECK-NEXT: mtctr 7
-; CHECK-NEXT: addi 7, 1, 36
+; CHECK-NEXT: addi 7, 1, 32
; CHECK-NEXT: .LBB2_1: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: lwz 8, 0(4)
; CHECK-NEXT: lwz 12, 12(5)
-; CHECK-NEXT: lwz 9, 4(4)
-; CHECK-NEXT: lwz 10, 8(4)
+; CHECK-NEXT: lwz 9, 8(4)
+; CHECK-NEXT: lwz 10, 4(4)
; CHECK-NEXT: lwz 11, 12(4)
-; CHECK-NEXT: stw 8, 36(1)
+; CHECK-NEXT: stw 8, 32(1)
; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
-; CHECK-NEXT: stw 6, 32(1)
-; CHECK-NEXT: sub 8, 7, 8
; CHECK-NEXT: stw 6, 28(1)
-; CHECK-NEXT: stw 6, 24(1)
+; CHECK-NEXT: sub 8, 7, 8
; CHECK-NEXT: stw 6, 20(1)
-; CHECK-NEXT: stw 11, 48(1)
+; CHECK-NEXT: stw 6, 24(1)
+; CHECK-NEXT: stw 6, 16(1)
+; CHECK-NEXT: stw 11, 44(1)
; CHECK-NEXT: clrlwi 11, 12, 27
-; CHECK-NEXT: stw 10, 44(1)
+; CHECK-NEXT: stw 10, 36(1)
; CHECK-NEXT: subfic 0, 11, 32
; CHECK-NEXT: stw 9, 40(1)
; CHECK-NEXT: lwz 9, 4(8)
diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
index cc90300aafcea..8932733db1e40 100644
--- a/llvm/test/CodeGen/PowerPC/pr59074.ll
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -32,20 +32,21 @@ define void @pr59074(ptr %0) {
; LE32-NEXT: li 7, 0
; LE32-NEXT: li 8, 12
; LE32-NEXT: xxswapd 0, 0
+; LE32-NEXT: rlwimi 5, 6, 0, 30, 28
; LE32-NEXT: addi 4, 4, -12
; LE32-NEXT: rlwinm 9, 4, 29, 28, 29
-; LE32-NEXT: stxvd2x 0, 6, 5
-; LE32-NEXT: stw 7, 44(1)
+; LE32-NEXT: stxvd2x 0, 0, 5
; LE32-NEXT: stw 7, 40(1)
-; LE32-NEXT: stw 7, 36(1)
; LE32-NEXT: stw 8, 16(1)
+; LE32-NEXT: stw 7, 44(1)
+; LE32-NEXT: stw 7, 36(1)
; LE32-NEXT: clrlwi 4, 4, 27
; LE32-NEXT: lwzux 5, 9, 6
-; LE32-NEXT: subfic 11, 4, 32
; LE32-NEXT: lwz 6, 8(9)
; LE32-NEXT: lwz 7, 4(9)
; LE32-NEXT: lwz 8, 12(9)
; LE32-NEXT: xori 9, 4, 31
+; LE32-NEXT: subfic 11, 4, 32
; LE32-NEXT: srw 5, 5, 4
; LE32-NEXT: slwi 10, 6, 1
; LE32-NEXT: srw 6, 6, 4
@@ -90,8 +91,8 @@ define void @pr59074(ptr %0) {
; BE32-NEXT: stxvw4x 0, 0, 5
; BE32-NEXT: stw 6, -36(1)
; BE32-NEXT: addi 4, 4, -12
-; BE32-NEXT: stw 7, -40(1)
; BE32-NEXT: stw 7, -44(1)
+; BE32-NEXT: stw 7, -40(1)
; BE32-NEXT: stw 7, -48(1)
; BE32-NEXT: rlwinm 9, 4, 29, 28, 29
; BE32-NEXT: clrlwi 4, 4, 27
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
index f6fdb4ae20794..29b91d8ef89dc 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -229,29 +229,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 4(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
-; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: addi 3, 1, 32
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
; LE-32BIT-NEXT: stw 6, 28(1)
-; LE-32BIT-NEXT: sub 3, 3, 4
-; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 16(1)
-; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 32
+; LE-32BIT-NEXT: stw 9, 36(1)
+; LE-32BIT-NEXT: sub 3, 3, 6
+; LE-32BIT-NEXT: stw 8, 40(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
; LE-32BIT-NEXT: stw 7, 32(1)
-; LE-32BIT-NEXT: lwz 4, 4(3)
-; LE-32BIT-NEXT: lwz 6, 0(3)
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: lwz 3, 12(3)
-; LE-32BIT-NEXT: stw 7, 8(5)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: srw 4, 7, 4
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: stw 6, 0(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -301,30 +313,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-LABEL: shl_16bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -48(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: lwz 9, 0(3)
+; LE-32BIT-NEXT: lwz 3, 8(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
-; LE-32BIT-NEXT: stw 6, 44(1)
; LE-32BIT-NEXT: stw 6, 40(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
-; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
-; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: stw 6, 36(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: addi 3, 1, 16
-; LE-32BIT-NEXT: stw 9, 24(1)
-; LE-32BIT-NEXT: stw 8, 20(1)
-; LE-32BIT-NEXT: stw 7, 16(1)
-; LE-32BIT-NEXT: lwzux 3, 4, 3
-; LE-32BIT-NEXT: lwz 6, 4(4)
-; LE-32BIT-NEXT: lwz 7, 12(4)
-; LE-32BIT-NEXT: lwz 4, 8(4)
+; LE-32BIT-NEXT: stw 9, 16(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 8, 28(1)
+; LE-32BIT-NEXT: subfic 8, 4, 32
+; LE-32BIT-NEXT: stw 7, 20(1)
+; LE-32BIT-NEXT: lwzux 3, 6, 3
+; LE-32BIT-NEXT: lwz 9, 4(6)
+; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 7, 8(6)
+; LE-32BIT-NEXT: lwz 6, 12(6)
+; LE-32BIT-NEXT: slw 11, 9, 4
+; LE-32BIT-NEXT: srw 9, 9, 8
+; LE-32BIT-NEXT: srw 10, 7, 8
+; LE-32BIT-NEXT: srw 8, 6, 8
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: or 3, 3, 9
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: or 4, 7, 8
; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 11, 10
; LE-32BIT-NEXT: stw 4, 8(5)
-; LE-32BIT-NEXT: stw 7, 12(5)
-; LE-32BIT-NEXT: stw 6, 4(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -380,31 +404,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-LABEL: ashr_16bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -48(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 7, 8(3)
; LE-32BIT-NEXT: addi 6, 1, 32
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: lwz 9, 4(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
-; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: stw 8, 36(1)
-; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: srawi 3, 8, 31
+; LE-32BIT-NEXT: stw 7, 40(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 9, 36(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 8, 32(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
; LE-32BIT-NEXT: stw 3, 28(1)
-; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 16(1)
-; LE-32BIT-NEXT: sub 3, 6, 4
-; LE-32BIT-NEXT: lwz 4, 4(3)
-; LE-32BIT-NEXT: lwz 6, 0(3)
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: lwz 3, 12(3)
-; LE-32BIT-NEXT: stw 7, 8(5)
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: sraw 4, 7, 4
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: stw 6, 0(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -422,20 +458,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: lxvd2x 1, 0, 3
; LE-64BIT-NEXT: xxlxor 2, 2, 2
; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: li 8, 32
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
; LE-64BIT-NEXT: lwz 3, 0(4)
; LE-64BIT-NEXT: li 4, 48
; LE-64BIT-NEXT: stxvd2x 2, 7, 4
-; LE-64BIT-NEXT: li 4, 32
-; LE-64BIT-NEXT: clrldi 3, 3, 59
-; LE-64BIT-NEXT: stxvd2x 2, 7, 4
+; LE-64BIT-NEXT: stxvd2x 2, 7, 8
+; LE-64BIT-NEXT: rlwinm 4, 3, 0, 27, 28
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 26, 28
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: stxvd2x 1, 0, 7
-; LE-64BIT-NEXT: lxvd2x 0, 7, 3
-; LE-64BIT-NEXT: add 3, 7, 3
-; LE-64BIT-NEXT: lxvd2x 1, 3, 6
-; LE-64BIT-NEXT: stxvd2x 1, 5, 6
-; LE-64BIT-NEXT: stxvd2x 0, 0, 5
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: subfic 7, 3, 64
+; LE-64BIT-NEXT: ld 8, 8(4)
+; LE-64BIT-NEXT: ld 9, 16(4)
+; LE-64BIT-NEXT: ld 4, 24(4)
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sld 10, 8, 7
+; LE-64BIT-NEXT: sld 11, 4, 7
+; LE-64BIT-NEXT: srd 8, 8, 3
+; LE-64BIT-NEXT: sld 7, 9, 7
+; LE-64BIT-NEXT: or 6, 10, 6
+; LE-64BIT-NEXT: srd 10, 9, 3
+; LE-64BIT-NEXT: srd 3, 4, 3
+; LE-64BIT-NEXT: or 7, 7, 8
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: std 6, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: lshr_32bytes:
@@ -445,79 +496,126 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 24(9)
-; BE-NEXT: std 10, 16(9)
-; BE-NEXT: std 10, 8(9)
-; BE-NEXT: std 10, -64(1)
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: clrlwi 3, 4, 27
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 0, 27, 28
; BE-NEXT: neg 3, 3
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
-; BE-NEXT: std 6, 32(9)
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: addi 4, 1, -32
-; BE-NEXT: ldux 3, 4, 3
-; BE-NEXT: ld 6, 8(4)
-; BE-NEXT: ld 7, 24(4)
-; BE-NEXT: ld 4, 16(4)
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srd 3, 3, 4
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 4, 16(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: std 6, 16(5)
; BE-NEXT: std 7, 24(5)
-; BE-NEXT: std 6, 8(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: lshr_32bytes:
; LE-32BIT: # %bb.0:
-; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: stwu 1, -112(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 10, 12(3)
-; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: lwz 12, 20(3)
-; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 16(3)
+; LE-32BIT-NEXT: lwz 10, 24(3)
+; LE-32BIT-NEXT: lwz 11, 4(3)
+; LE-32BIT-NEXT: lwz 12, 12(3)
+; LE-32BIT-NEXT: lwz 0, 20(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 3, 76(1)
-; LE-32BIT-NEXT: addi 3, 1, 48
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 6, 44(1)
-; LE-32BIT-NEXT: sub 3, 3, 4
-; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
-; LE-32BIT-NEXT: stw 6, 32(1)
; LE-32BIT-NEXT: stw 6, 28(1)
-; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 16(1)
-; LE-32BIT-NEXT: stw 0, 72(1)
-; LE-32BIT-NEXT: stw 12, 68(1)
-; LE-32BIT-NEXT: stw 11, 64(1)
-; LE-32BIT-NEXT: stw 10, 60(1)
-; LE-32BIT-NEXT: stw 9, 56(1)
-; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: sub 3, 3, 6
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 0, 68(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
+; LE-32BIT-NEXT: stw 12, 60(1)
+; LE-32BIT-NEXT: stw 11, 52(1)
+; LE-32BIT-NEXT: stw 10, 72(1)
+; LE-32BIT-NEXT: stw 9, 64(1)
+; LE-32BIT-NEXT: stw 8, 56(1)
; LE-32BIT-NEXT: stw 7, 48(1)
-; LE-32BIT-NEXT: lwz 4, 4(3)
-; LE-32BIT-NEXT: lwz 6, 0(3)
-; LE-32BIT-NEXT: lwz 7, 12(3)
-; LE-32BIT-NEXT: lwz 8, 8(3)
-; LE-32BIT-NEXT: lwz 9, 20(3)
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: lwz 11, 24(3)
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: srw 28, 8, 4
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 0, 12
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: stw 10, 16(5)
-; LE-32BIT-NEXT: stw 9, 20(5)
-; LE-32BIT-NEXT: stw 8, 8(5)
-; LE-32BIT-NEXT: stw 7, 12(5)
-; LE-32BIT-NEXT: stw 6, 0(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
-; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: stw 3, 20(5)
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -530,26 +628,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: shl_32bytes:
; LE-64BIT: # %bb.0:
; LE-64BIT-NEXT: li 6, 16
-; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: lwz 4, 0(4)
; LE-64BIT-NEXT: xxlxor 2, 2, 2
-; LE-64BIT-NEXT: li 7, 48
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: addi 8, 1, -32
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
-; LE-64BIT-NEXT: lwz 3, 0(4)
-; LE-64BIT-NEXT: addi 4, 1, -64
-; LE-64BIT-NEXT: stxvd2x 2, 4, 6
-; LE-64BIT-NEXT: clrlwi 3, 3, 27
-; LE-64BIT-NEXT: stxvd2x 0, 4, 7
-; LE-64BIT-NEXT: li 7, 32
+; LE-64BIT-NEXT: stxvd2x 2, 7, 6
+; LE-64BIT-NEXT: li 6, 48
+; LE-64BIT-NEXT: rlwinm 3, 4, 0, 27, 28
+; LE-64BIT-NEXT: rlwinm 4, 4, 3, 26, 28
; LE-64BIT-NEXT: neg 3, 3
-; LE-64BIT-NEXT: stxvd2x 1, 4, 7
-; LE-64BIT-NEXT: stxvd2x 2, 0, 4
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: li 6, 32
; LE-64BIT-NEXT: extsw 3, 3
-; LE-64BIT-NEXT: addi 4, 1, -32
-; LE-64BIT-NEXT: lxvd2x 0, 4, 3
-; LE-64BIT-NEXT: add 3, 4, 3
-; LE-64BIT-NEXT: lxvd2x 1, 3, 6
-; LE-64BIT-NEXT: stxvd2x 1, 5, 6
-; LE-64BIT-NEXT: stxvd2x 0, 0, 5
+; LE-64BIT-NEXT: stxvd2x 1, 7, 6
+; LE-64BIT-NEXT: stxvd2x 2, 0, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
+; LE-64BIT-NEXT: ldux 3, 8, 3
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
+; LE-64BIT-NEXT: sld 7, 7, 4
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: sld 3, 3, 4
+; LE-64BIT-NEXT: or 6, 8, 6
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: shl_32bytes:
@@ -559,75 +672,123 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 56(9)
-; BE-NEXT: std 10, 48(9)
-; BE-NEXT: std 10, 40(9)
-; BE-NEXT: std 10, 32(9)
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 8, 16(9)
-; BE-NEXT: std 7, 8(9)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
; BE-NEXT: std 6, -64(1)
-; BE-NEXT: clrldi 3, 4, 59
-; BE-NEXT: ldux 4, 3, 9
-; BE-NEXT: ld 6, 8(3)
-; BE-NEXT: ld 7, 24(3)
-; BE-NEXT: ld 3, 16(3)
-; BE-NEXT: std 4, 0(5)
-; BE-NEXT: std 3, 16(5)
-; BE-NEXT: std 7, 24(5)
-; BE-NEXT: std 6, 8(5)
+; BE-NEXT: rlwinm 3, 4, 0, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: sld 6, 6, 4
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
+; BE-NEXT: sld 3, 3, 4
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
+; BE-NEXT: std 3, 24(5)
+; BE-NEXT: std 7, 16(5)
+; BE-NEXT: std 6, 0(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: shl_32bytes:
; LE-32BIT: # %bb.0:
-; LE-32BIT-NEXT: stwu 1, -80(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: stwu 1, -112(1)
+; LE-32BIT-NEXT: lwz 7, 4(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 10, 12(3)
-; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: lwz 12, 20(3)
-; LE-32BIT-NEXT: lwz 0, 24(3)
-; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: lwz 9, 20(3)
+; LE-32BIT-NEXT: lwz 10, 28(3)
+; LE-32BIT-NEXT: lwz 11, 0(3)
+; LE-32BIT-NEXT: lwz 12, 8(3)
+; LE-32BIT-NEXT: lwz 0, 16(3)
+; LE-32BIT-NEXT: lwz 3, 24(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 6, 76(1)
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 6, 72(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
-; LE-32BIT-NEXT: stw 6, 68(1)
; LE-32BIT-NEXT: stw 6, 64(1)
-; LE-32BIT-NEXT: stw 6, 60(1)
; LE-32BIT-NEXT: stw 6, 56(1)
-; LE-32BIT-NEXT: stw 6, 52(1)
; LE-32BIT-NEXT: stw 6, 48(1)
-; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: stw 6, 76(1)
+; LE-32BIT-NEXT: stw 6, 68(1)
+; LE-32BIT-NEXT: stw 6, 60(1)
+; LE-32BIT-NEXT: stw 6, 52(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 3, 40(1)
; LE-32BIT-NEXT: addi 3, 1, 16
-; LE-32BIT-NEXT: stw 0, 40(1)
-; LE-32BIT-NEXT: stw 12, 36(1)
-; LE-32BIT-NEXT: stw 11, 32(1)
-; LE-32BIT-NEXT: stw 10, 28(1)
-; LE-32BIT-NEXT: stw 9, 24(1)
-; LE-32BIT-NEXT: stw 8, 20(1)
-; LE-32BIT-NEXT: stw 7, 16(1)
-; LE-32BIT-NEXT: lwzux 3, 4, 3
-; LE-32BIT-NEXT: lwz 6, 4(4)
-; LE-32BIT-NEXT: lwz 7, 12(4)
-; LE-32BIT-NEXT: lwz 8, 8(4)
-; LE-32BIT-NEXT: lwz 9, 20(4)
-; LE-32BIT-NEXT: lwz 10, 16(4)
-; LE-32BIT-NEXT: lwz 11, 28(4)
-; LE-32BIT-NEXT: lwz 4, 24(4)
-; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: stw 0, 32(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 12, 24(1)
+; LE-32BIT-NEXT: subfic 12, 4, 32
+; LE-32BIT-NEXT: stw 11, 16(1)
+; LE-32BIT-NEXT: stw 10, 44(1)
+; LE-32BIT-NEXT: stw 9, 36(1)
+; LE-32BIT-NEXT: stw 8, 28(1)
+; LE-32BIT-NEXT: stw 7, 20(1)
+; LE-32BIT-NEXT: lwzux 3, 6, 3
+; LE-32BIT-NEXT: lwz 7, 8(6)
+; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 8, 4(6)
+; LE-32BIT-NEXT: lwz 9, 16(6)
+; LE-32BIT-NEXT: srw 30, 7, 12
+; LE-32BIT-NEXT: lwz 10, 12(6)
+; LE-32BIT-NEXT: slw 29, 8, 4
+; LE-32BIT-NEXT: lwz 11, 24(6)
+; LE-32BIT-NEXT: srw 8, 8, 12
+; LE-32BIT-NEXT: lwz 0, 20(6)
+; LE-32BIT-NEXT: srw 28, 9, 12
+; LE-32BIT-NEXT: lwz 6, 28(6)
+; LE-32BIT-NEXT: slw 27, 10, 4
+; LE-32BIT-NEXT: srw 10, 10, 12
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: srw 26, 11, 12
+; LE-32BIT-NEXT: slw 25, 0, 4
+; LE-32BIT-NEXT: srw 0, 0, 12
+; LE-32BIT-NEXT: slw 9, 9, 4
+; LE-32BIT-NEXT: srw 12, 6, 12
+; LE-32BIT-NEXT: slw 11, 11, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: stw 4, 28(5)
+; LE-32BIT-NEXT: or 4, 11, 12
; LE-32BIT-NEXT: stw 4, 24(5)
-; LE-32BIT-NEXT: stw 11, 28(5)
-; LE-32BIT-NEXT: stw 10, 16(5)
-; LE-32BIT-NEXT: stw 9, 20(5)
-; LE-32BIT-NEXT: stw 8, 8(5)
-; LE-32BIT-NEXT: stw 7, 12(5)
-; LE-32BIT-NEXT: stw 6, 4(5)
-; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: or 4, 9, 0
+; LE-32BIT-NEXT: stw 4, 16(5)
+; LE-32BIT-NEXT: or 4, 25, 26
+; LE-32BIT-NEXT: stw 4, 20(5)
+; LE-32BIT-NEXT: or 4, 7, 10
+; LE-32BIT-NEXT: or 3, 3, 8
+; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: or 4, 27, 28
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -639,26 +800,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_32bytes:
; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: ld 6, 24(3)
; LE-64BIT-NEXT: lxvd2x 0, 0, 3
-; LE-64BIT-NEXT: ld 6, 16(3)
-; LE-64BIT-NEXT: ld 3, 24(3)
-; LE-64BIT-NEXT: addi 7, 1, -64
; LE-64BIT-NEXT: lwz 4, 0(4)
-; LE-64BIT-NEXT: li 8, 16
-; LE-64BIT-NEXT: std 3, 24(7)
-; LE-64BIT-NEXT: sradi 3, 3, 63
-; LE-64BIT-NEXT: std 6, 16(7)
-; LE-64BIT-NEXT: std 3, 56(7)
-; LE-64BIT-NEXT: std 3, 48(7)
-; LE-64BIT-NEXT: std 3, 40(7)
-; LE-64BIT-NEXT: std 3, 32(7)
-; LE-64BIT-NEXT: clrldi 3, 4, 59
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: ld 3, 16(3)
+; LE-64BIT-NEXT: sradi 8, 6, 63
+; LE-64BIT-NEXT: rlwinm 9, 4, 0, 27, 28
; LE-64BIT-NEXT: stxvd2x 0, 0, 7
-; LE-64BIT-NEXT: lxvd2x 0, 7, 3
-; LE-64BIT-NEXT: add 3, 7, 3
-; LE-64BIT-NEXT: lxvd2x 1, 3, 8
-; LE-64BIT-NEXT: stxvd2x 1, 5, 8
-; LE-64BIT-NEXT: stxvd2x 0, 0, 5
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: rlwinm 3, 4, 3, 26, 28
+; LE-64BIT-NEXT: ldux 4, 9, 7
+; LE-64BIT-NEXT: ld 7, 8(9)
+; LE-64BIT-NEXT: subfic 6, 3, 64
+; LE-64BIT-NEXT: ld 8, 16(9)
+; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 4, 4, 3
+; LE-64BIT-NEXT: sld 10, 7, 6
+; LE-64BIT-NEXT: sld 11, 9, 6
+; LE-64BIT-NEXT: srd 7, 7, 3
+; LE-64BIT-NEXT: sld 6, 8, 6
+; LE-64BIT-NEXT: or 4, 10, 4
+; LE-64BIT-NEXT: srd 10, 8, 3
+; LE-64BIT-NEXT: srad 3, 9, 3
+; LE-64BIT-NEXT: or 6, 6, 7
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 4, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: ashr_32bytes:
@@ -668,79 +843,126 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 6, 1, -64
-; BE-NEXT: std 3, 56(6)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
; BE-NEXT: sradi 3, 7, 63
-; BE-NEXT: clrlwi 4, 4, 27
-; BE-NEXT: std 3, 24(6)
-; BE-NEXT: std 3, 16(6)
-; BE-NEXT: std 3, 8(6)
+; BE-NEXT: rlwinm 7, 4, 0, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
; BE-NEXT: std 3, -64(1)
-; BE-NEXT: neg 3, 4
-; BE-NEXT: std 9, 48(6)
-; BE-NEXT: std 8, 40(6)
-; BE-NEXT: std 7, 32(6)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: addi 4, 1, -32
-; BE-NEXT: ldux 3, 4, 3
-; BE-NEXT: ld 6, 8(4)
-; BE-NEXT: ld 7, 24(4)
-; BE-NEXT: ld 4, 16(4)
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srad 3, 3, 4
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 4, 16(5)
-; BE-NEXT: std 7, 24(5)
-; BE-NEXT: std 6, 8(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: ashr_32bytes:
; LE-32BIT: # %bb.0:
-; LE-32BIT-NEXT: stwu 1, -80(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: stwu 1, -112(1)
+; LE-32BIT-NEXT: lwz 7, 8(3)
; LE-32BIT-NEXT: addi 6, 1, 48
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 10, 12(3)
-; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: lwz 12, 20(3)
-; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 8, 16(3)
+; LE-32BIT-NEXT: lwz 9, 24(3)
+; LE-32BIT-NEXT: lwz 10, 0(3)
+; LE-32BIT-NEXT: lwz 11, 4(3)
+; LE-32BIT-NEXT: lwz 12, 12(3)
+; LE-32BIT-NEXT: lwz 0, 20(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
; LE-32BIT-NEXT: stw 3, 76(1)
-; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
-; LE-32BIT-NEXT: stw 0, 72(1)
-; LE-32BIT-NEXT: stw 12, 68(1)
-; LE-32BIT-NEXT: stw 11, 64(1)
-; LE-32BIT-NEXT: stw 10, 60(1)
-; LE-32BIT-NEXT: stw 9, 56(1)
-; LE-32BIT-NEXT: stw 8, 52(1)
-; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: srawi 3, 10, 31
+; LE-32BIT-NEXT: stw 7, 56(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 0, 68(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
+; LE-32BIT-NEXT: stw 12, 60(1)
+; LE-32BIT-NEXT: stw 11, 52(1)
+; LE-32BIT-NEXT: stw 9, 72(1)
+; LE-32BIT-NEXT: stw 8, 64(1)
+; LE-32BIT-NEXT: stw 10, 48(1)
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: stw 3, 40(1)
; LE-32BIT-NEXT: stw 3, 36(1)
-; LE-32BIT-NEXT: stw 3, 32(1)
; LE-32BIT-NEXT: stw 3, 28(1)
-; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 40(1)
+; LE-32BIT-NEXT: stw 3, 32(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 16(1)
-; LE-32BIT-NEXT: sub 3, 6, 4
-; LE-32BIT-NEXT: lwz 4, 4(3)
-; LE-32BIT-NEXT: lwz 6, 0(3)
-; LE-32BIT-NEXT: lwz 7, 12(3)
-; LE-32BIT-NEXT: lwz 8, 8(3)
-; LE-32BIT-NEXT: lwz 9, 20(3)
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: lwz 11, 24(3)
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: srw 28, 8, 4
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 0, 12
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: stw 10, 16(5)
-; LE-32BIT-NEXT: stw 9, 20(5)
-; LE-32BIT-NEXT: stw 8, 8(5)
-; LE-32BIT-NEXT: stw 7, 12(5)
-; LE-32BIT-NEXT: stw 6, 0(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
-; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: stw 3, 20(5)
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: sraw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 98c76a7d3887c..f0fe0765ce19b 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -209,20 +209,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 4(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 6, 28(1)
-; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 16(1)
; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: addi 3, 1, 32
-; LE-32BIT-NEXT: stw 9, 40(1)
+; LE-32BIT-NEXT: stw 9, 36(1)
; LE-32BIT-NEXT: sub 3, 3, 6
-; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: stw 8, 40(1)
; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 7, 32(1)
; LE-32BIT-NEXT: subfic 9, 4, 32
@@ -290,24 +290,24 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-LABEL: shl_16bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -48(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: lwz 9, 0(3)
+; LE-32BIT-NEXT: lwz 3, 8(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
-; LE-32BIT-NEXT: stw 6, 44(1)
; LE-32BIT-NEXT: stw 6, 40(1)
-; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
-; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: addi 3, 1, 16
-; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: stw 9, 16(1)
; LE-32BIT-NEXT: clrlwi 4, 4, 27
-; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: stw 8, 28(1)
; LE-32BIT-NEXT: subfic 8, 4, 32
-; LE-32BIT-NEXT: stw 7, 16(1)
+; LE-32BIT-NEXT: stw 7, 20(1)
; LE-32BIT-NEXT: lwzux 3, 6, 3
; LE-32BIT-NEXT: lwz 9, 4(6)
; LE-32BIT-NEXT: slw 3, 3, 4
@@ -378,23 +378,23 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-LABEL: ashr_16bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -48(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 7, 8(3)
; LE-32BIT-NEXT: addi 6, 1, 32
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: lwz 9, 4(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: srawi 3, 8, 31
+; LE-32BIT-NEXT: stw 7, 40(1)
; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 29
-; LE-32BIT-NEXT: stw 9, 40(1)
+; LE-32BIT-NEXT: stw 9, 36(1)
; LE-32BIT-NEXT: clrlwi 4, 4, 27
-; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: stw 8, 32(1)
; LE-32BIT-NEXT: subfic 9, 4, 32
; LE-32BIT-NEXT: stw 3, 28(1)
-; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 16(1)
; LE-32BIT-NEXT: sub 3, 6, 7
; LE-32BIT-NEXT: lwz 6, 4(3)
@@ -511,25 +511,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stwu 1, -112(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 10, 12(3)
-; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: lwz 12, 20(3)
-; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 16(3)
+; LE-32BIT-NEXT: lwz 10, 24(3)
+; LE-32BIT-NEXT: lwz 11, 4(3)
+; LE-32BIT-NEXT: lwz 12, 12(3)
+; LE-32BIT-NEXT: lwz 0, 20(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 6, 48(1)
; LE-32BIT-NEXT: stw 6, 44(1)
-; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
-; LE-32BIT-NEXT: stw 6, 32(1)
; LE-32BIT-NEXT: stw 6, 28(1)
-; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
+; LE-32BIT-NEXT: stw 6, 16(1)
; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
-; LE-32BIT-NEXT: stw 3, 80(1)
-; LE-32BIT-NEXT: addi 3, 1, 52
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: sub 3, 3, 6
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
@@ -538,54 +538,52 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: subfic 30, 4, 32
-; LE-32BIT-NEXT: stw 0, 76(1)
-; LE-32BIT-NEXT: stw 12, 72(1)
-; LE-32BIT-NEXT: xori 12, 4, 31
-; LE-32BIT-NEXT: stw 11, 68(1)
-; LE-32BIT-NEXT: stw 10, 64(1)
-; LE-32BIT-NEXT: stw 9, 60(1)
+; LE-32BIT-NEXT: stw 0, 68(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
+; LE-32BIT-NEXT: stw 12, 60(1)
+; LE-32BIT-NEXT: stw 11, 52(1)
+; LE-32BIT-NEXT: stw 10, 72(1)
+; LE-32BIT-NEXT: stw 9, 64(1)
; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: stw 7, 52(1)
-; LE-32BIT-NEXT: lwz 6, 8(3)
-; LE-32BIT-NEXT: lwz 7, 4(3)
-; LE-32BIT-NEXT: lwz 8, 0(3)
-; LE-32BIT-NEXT: srw 29, 6, 4
-; LE-32BIT-NEXT: lwz 9, 12(3)
-; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
; LE-32BIT-NEXT: lwz 10, 20(3)
-; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: srw 28, 8, 4
; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: srw 27, 9, 4
-; LE-32BIT-NEXT: lwz 0, 28(3)
-; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
; LE-32BIT-NEXT: lwz 3, 24(3)
-; LE-32BIT-NEXT: slw 25, 11, 30
-; LE-32BIT-NEXT: slw 9, 9, 30
-; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
; LE-32BIT-NEXT: srw 3, 3, 4
-; LE-32BIT-NEXT: srw 0, 0, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
; LE-32BIT-NEXT: or 3, 10, 3
; LE-32BIT-NEXT: srw 11, 11, 4
; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 30, 0
+; LE-32BIT-NEXT: or 3, 0, 12
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
; LE-32BIT-NEXT: stw 3, 16(5)
; LE-32BIT-NEXT: or 3, 25, 26
-; LE-32BIT-NEXT: srw 8, 8, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 6, 27
-; LE-32BIT-NEXT: slw 7, 7, 12
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: srw 4, 7, 4
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 28, 4
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 29, 7
-; LE-32BIT-NEXT: stw 8, 0(5)
-; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
@@ -684,15 +682,15 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-LABEL: shl_32bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -112(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 10, 12(3)
-; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: lwz 12, 20(3)
-; LE-32BIT-NEXT: lwz 0, 24(3)
-; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: lwz 9, 20(3)
+; LE-32BIT-NEXT: lwz 10, 28(3)
+; LE-32BIT-NEXT: lwz 11, 0(3)
+; LE-32BIT-NEXT: lwz 12, 8(3)
+; LE-32BIT-NEXT: lwz 0, 16(3)
+; LE-32BIT-NEXT: lwz 3, 24(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
@@ -700,25 +698,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: stw 6, 80(1)
-; LE-32BIT-NEXT: stw 6, 76(1)
; LE-32BIT-NEXT: stw 6, 72(1)
-; LE-32BIT-NEXT: stw 6, 68(1)
; LE-32BIT-NEXT: stw 6, 64(1)
-; LE-32BIT-NEXT: stw 6, 60(1)
; LE-32BIT-NEXT: stw 6, 56(1)
+; LE-32BIT-NEXT: stw 6, 48(1)
+; LE-32BIT-NEXT: stw 6, 76(1)
+; LE-32BIT-NEXT: stw 6, 68(1)
+; LE-32BIT-NEXT: stw 6, 60(1)
; LE-32BIT-NEXT: stw 6, 52(1)
; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
-; LE-32BIT-NEXT: stw 3, 48(1)
-; LE-32BIT-NEXT: addi 3, 1, 20
-; LE-32BIT-NEXT: stw 0, 44(1)
+; LE-32BIT-NEXT: stw 3, 40(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 0, 32(1)
; LE-32BIT-NEXT: clrlwi 4, 4, 27
-; LE-32BIT-NEXT: stw 12, 40(1)
+; LE-32BIT-NEXT: stw 12, 24(1)
; LE-32BIT-NEXT: subfic 12, 4, 32
-; LE-32BIT-NEXT: stw 11, 36(1)
-; LE-32BIT-NEXT: stw 10, 32(1)
-; LE-32BIT-NEXT: stw 9, 28(1)
-; LE-32BIT-NEXT: stw 8, 24(1)
+; LE-32BIT-NEXT: stw 11, 16(1)
+; LE-32BIT-NEXT: stw 10, 44(1)
+; LE-32BIT-NEXT: stw 9, 36(1)
+; LE-32BIT-NEXT: stw 8, 28(1)
; LE-32BIT-NEXT: stw 7, 20(1)
; LE-32BIT-NEXT: lwzux 3, 6, 3
; LE-32BIT-NEXT: lwz 7, 8(6)
@@ -858,19 +856,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-LABEL: ashr_32bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -112(1)
-; LE-32BIT-NEXT: lwz 7, 0(3)
-; LE-32BIT-NEXT: addi 6, 1, 52
-; LE-32BIT-NEXT: lwz 8, 4(3)
-; LE-32BIT-NEXT: lwz 9, 8(3)
-; LE-32BIT-NEXT: lwz 10, 12(3)
-; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: lwz 12, 20(3)
-; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 7, 8(3)
+; LE-32BIT-NEXT: addi 6, 1, 48
+; LE-32BIT-NEXT: lwz 8, 16(3)
+; LE-32BIT-NEXT: lwz 9, 24(3)
+; LE-32BIT-NEXT: lwz 10, 0(3)
+; LE-32BIT-NEXT: lwz 11, 4(3)
+; LE-32BIT-NEXT: lwz 12, 12(3)
+; LE-32BIT-NEXT: lwz 0, 20(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 3, 80(1)
-; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: stw 7, 52(1)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: srawi 3, 10, 31
+; LE-32BIT-NEXT: stw 7, 56(1)
; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 29
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: clrlwi 4, 4, 27
@@ -879,62 +877,60 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: subfic 30, 4, 32
-; LE-32BIT-NEXT: stw 0, 76(1)
-; LE-32BIT-NEXT: stw 12, 72(1)
-; LE-32BIT-NEXT: xori 12, 4, 31
-; LE-32BIT-NEXT: stw 11, 68(1)
-; LE-32BIT-NEXT: stw 10, 64(1)
-; LE-32BIT-NEXT: stw 9, 60(1)
-; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: stw 3, 48(1)
+; LE-32BIT-NEXT: stw 0, 68(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
+; LE-32BIT-NEXT: stw 12, 60(1)
+; LE-32BIT-NEXT: stw 11, 52(1)
+; LE-32BIT-NEXT: stw 9, 72(1)
+; LE-32BIT-NEXT: stw 8, 64(1)
+; LE-32BIT-NEXT: stw 10, 48(1)
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: stw 3, 40(1)
; LE-32BIT-NEXT: stw 3, 36(1)
-; LE-32BIT-NEXT: stw 3, 32(1)
; LE-32BIT-NEXT: stw 3, 28(1)
-; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 40(1)
+; LE-32BIT-NEXT: stw 3, 32(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
; LE-32BIT-NEXT: sub 3, 6, 7
-; LE-32BIT-NEXT: lwz 6, 8(3)
-; LE-32BIT-NEXT: lwz 7, 4(3)
-; LE-32BIT-NEXT: lwz 8, 0(3)
-; LE-32BIT-NEXT: srw 29, 6, 4
-; LE-32BIT-NEXT: lwz 9, 12(3)
-; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
; LE-32BIT-NEXT: lwz 10, 20(3)
-; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: srw 28, 8, 4
; LE-32BIT-NEXT: lwz 11, 16(3)
-; LE-32BIT-NEXT: srw 27, 9, 4
-; LE-32BIT-NEXT: lwz 0, 28(3)
-; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
; LE-32BIT-NEXT: lwz 3, 24(3)
-; LE-32BIT-NEXT: slw 25, 11, 30
-; LE-32BIT-NEXT: slw 9, 9, 30
-; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
; LE-32BIT-NEXT: srw 3, 3, 4
-; LE-32BIT-NEXT: srw 0, 0, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
; LE-32BIT-NEXT: or 3, 10, 3
; LE-32BIT-NEXT: srw 11, 11, 4
; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 30, 0
+; LE-32BIT-NEXT: or 3, 0, 12
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
; LE-32BIT-NEXT: stw 3, 16(5)
; LE-32BIT-NEXT: or 3, 25, 26
-; LE-32BIT-NEXT: sraw 8, 8, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 6, 27
-; LE-32BIT-NEXT: slw 7, 7, 12
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: sraw 4, 7, 4
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 28, 4
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 29, 7
-; LE-32BIT-NEXT: stw 8, 0(5)
-; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 5ba8755201ddf..19854afba772c 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -153,18 +153,18 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: addi sp, sp, -32
; RV32I-NEXT: lw a2, 0(a2)
-; RV32I-NEXT: lw a3, 0(a1)
-; RV32I-NEXT: lw a4, 4(a1)
+; RV32I-NEXT: lw a3, 4(a1)
+; RV32I-NEXT: lw a4, 12(a1)
; RV32I-NEXT: lw a5, 8(a1)
-; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: lw a1, 0(a1)
; RV32I-NEXT: sw zero, 24(sp)
-; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 16(sp)
-; RV32I-NEXT: sw a1, 12(sp)
; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a4, 4(sp)
-; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a1, 0(sp)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw a4, 12(sp)
+; RV32I-NEXT: sw a3, 4(sp)
; RV32I-NEXT: srli a1, a2, 3
; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
@@ -222,19 +222,19 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: addi sp, sp, -32
; RV32I-NEXT: lw a2, 0(a2)
-; RV32I-NEXT: lw a3, 12(a1)
-; RV32I-NEXT: lw a4, 8(a1)
-; RV32I-NEXT: lw a5, 4(a1)
-; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: sw a3, 12(sp)
-; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a5, 4(sp)
-; RV32I-NEXT: sw a1, 0(sp)
-; RV32I-NEXT: srai a3, a3, 31
-; RV32I-NEXT: sw a3, 28(sp)
+; RV32I-NEXT: lw a3, 8(a1)
+; RV32I-NEXT: lw a4, 0(a1)
+; RV32I-NEXT: lw a5, 12(a1)
+; RV32I-NEXT: lw a1, 4(a1)
+; RV32I-NEXT: sw a3, 8(sp)
+; RV32I-NEXT: sw a4, 0(sp)
+; RV32I-NEXT: srai a3, a5, 31
; RV32I-NEXT: sw a3, 24(sp)
-; RV32I-NEXT: sw a3, 20(sp)
; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a1, 4(sp)
+; RV32I-NEXT: sw a3, 28(sp)
+; RV32I-NEXT: sw a3, 20(sp)
; RV32I-NEXT: srli a1, a2, 3
; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
@@ -293,16 +293,16 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: addi sp, sp, -32
; RV32I-NEXT: lw a2, 0(a2)
; RV32I-NEXT: lw a3, 0(a1)
-; RV32I-NEXT: lw a4, 4(a1)
-; RV32I-NEXT: lw a5, 8(a1)
-; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: lw a4, 8(a1)
+; RV32I-NEXT: lw a5, 12(a1)
+; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw a5, 28(sp)
+; RV32I-NEXT: sw a1, 20(sp)
+; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 0(sp)
-; RV32I-NEXT: sw a1, 28(sp)
-; RV32I-NEXT: sw a5, 24(sp)
-; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a4, 24(sp)
; RV32I-NEXT: sw a3, 16(sp)
; RV32I-NEXT: srli a1, a2, 3
; RV32I-NEXT: andi a1, a1, 12
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 0b87bb05cfd63..9787d17362e9d 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -734,20 +734,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 5(a0)
+; RV32I-NEXT: lbu a6, 4(a0)
+; RV32I-NEXT: lbu a7, 6(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
@@ -775,12 +775,12 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, t0
; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a4, 8(sp)
; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: andi a0, a1, 12
; RV32I-NEXT: mv a3, sp
@@ -972,20 +972,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 5(a0)
+; RV32I-NEXT: lbu a6, 4(a0)
+; RV32I-NEXT: lbu a7, 6(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
@@ -1013,12 +1013,12 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, t0
; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 4(sp)
-; RV32I-NEXT: sw zero, 0(sp)
; RV32I-NEXT: sw a0, 28(sp)
-; RV32I-NEXT: sw a5, 24(sp)
-; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a5, 20(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a4, 24(sp)
; RV32I-NEXT: sw a3, 16(sp)
; RV32I-NEXT: andi a0, a1, 12
; RV32I-NEXT: addi a3, sp, 16
@@ -1210,20 +1210,20 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 5(a0)
+; RV32I-NEXT: lbu a6, 4(a0)
+; RV32I-NEXT: lbu a7, 6(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
@@ -1252,12 +1252,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, a7
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: sw a0, 28(sp)
-; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: sw a0, 20(sp)
-; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a4, 8(sp)
; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: andi a0, a1, 12
; RV32I-NEXT: mv a3, sp
@@ -1346,20 +1346,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 9(a0)
-; RV64I-NEXT: lbu a5, 8(a0)
-; RV64I-NEXT: lbu a6, 10(a0)
-; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: lbu a4, 17(a0)
+; RV64I-NEXT: lbu a5, 16(a0)
+; RV64I-NEXT: lbu a6, 18(a0)
+; RV64I-NEXT: lbu a7, 19(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 13(a0)
-; RV64I-NEXT: lbu a6, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a5, 21(a0)
+; RV64I-NEXT: lbu a6, 20(a0)
+; RV64I-NEXT: lbu a7, 22(a0)
+; RV64I-NEXT: lbu t0, 23(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -1368,20 +1368,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 17(a0)
-; RV64I-NEXT: lbu a6, 16(a0)
-; RV64I-NEXT: lbu a7, 18(a0)
-; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: lbu a5, 9(a0)
+; RV64I-NEXT: lbu a6, 8(a0)
+; RV64I-NEXT: lbu a7, 10(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 21(a0)
-; RV64I-NEXT: lbu a7, 20(a0)
-; RV64I-NEXT: lbu t0, 22(a0)
-; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu a7, 12(a0)
+; RV64I-NEXT: lbu t0, 14(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -1435,12 +1435,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 40(sp)
-; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd a0, 24(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a5, 8(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a4, 16(sp)
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: andi a0, a1, 24
; RV64I-NEXT: mv a3, sp
@@ -1544,60 +1544,60 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 17(a0)
+; RV32I-NEXT: lbu a6, 16(a0)
+; RV32I-NEXT: lbu a7, 18(a0)
+; RV32I-NEXT: lbu t0, 19(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: lbu a6, 13(a0)
-; RV32I-NEXT: lbu a7, 12(a0)
-; RV32I-NEXT: lbu t0, 14(a0)
-; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: lbu a6, 25(a0)
+; RV32I-NEXT: lbu a7, 24(a0)
+; RV32I-NEXT: lbu t0, 26(a0)
+; RV32I-NEXT: lbu t1, 27(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
; RV32I-NEXT: or a7, t1, t0
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: lbu a7, 17(a0)
-; RV32I-NEXT: lbu t0, 16(a0)
-; RV32I-NEXT: lbu t1, 18(a0)
-; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: lbu a7, 5(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or t0, t2, t1
; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: lbu t0, 21(a0)
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 22(a0)
-; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: lbu t0, 13(a0)
+; RV32I-NEXT: lbu t1, 12(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu t3, 15(a0)
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: or t0, t0, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: or t1, t3, t2
; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: lbu t1, 25(a0)
-; RV32I-NEXT: lbu t2, 24(a0)
-; RV32I-NEXT: lbu t3, 26(a0)
-; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: lbu t1, 21(a0)
+; RV32I-NEXT: lbu t2, 20(a0)
+; RV32I-NEXT: lbu t3, 22(a0)
+; RV32I-NEXT: lbu t4, 23(a0)
; RV32I-NEXT: slli t1, t1, 8
; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: slli t3, t3, 16
@@ -1624,24 +1624,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, t4
; RV32I-NEXT: or a1, a1, t2
-; RV32I-NEXT: sw zero, 64(sp)
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: sw zero, 48(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 36(sp)
-; RV32I-NEXT: sw a0, 32(sp)
-; RV32I-NEXT: sw t1, 28(sp)
-; RV32I-NEXT: sw t0, 24(sp)
-; RV32I-NEXT: sw a7, 20(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 12(sp)
+; RV32I-NEXT: sw a7, 4(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a6, 24(sp)
+; RV32I-NEXT: sw a5, 16(sp)
; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a3, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: andi a0, a1, 28
-; RV32I-NEXT: addi a3, sp, 4
+; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a5, a3, a0
; RV32I-NEXT: lw a3, 4(a5)
; RV32I-NEXT: slli a6, a1, 3
@@ -1777,20 +1777,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 9(a0)
-; RV64I-NEXT: lbu a5, 8(a0)
-; RV64I-NEXT: lbu a6, 10(a0)
-; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: lbu a4, 17(a0)
+; RV64I-NEXT: lbu a5, 16(a0)
+; RV64I-NEXT: lbu a6, 18(a0)
+; RV64I-NEXT: lbu a7, 19(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 13(a0)
-; RV64I-NEXT: lbu a6, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a5, 21(a0)
+; RV64I-NEXT: lbu a6, 20(a0)
+; RV64I-NEXT: lbu a7, 22(a0)
+; RV64I-NEXT: lbu t0, 23(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -1799,20 +1799,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 17(a0)
-; RV64I-NEXT: lbu a6, 16(a0)
-; RV64I-NEXT: lbu a7, 18(a0)
-; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: lbu a5, 9(a0)
+; RV64I-NEXT: lbu a6, 8(a0)
+; RV64I-NEXT: lbu a7, 10(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 21(a0)
-; RV64I-NEXT: lbu a7, 20(a0)
-; RV64I-NEXT: lbu t0, 22(a0)
-; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu a7, 12(a0)
+; RV64I-NEXT: lbu t0, 14(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -1866,12 +1866,12 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 8(sp)
-; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: sd a5, 48(sp)
-; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a5, 40(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a4, 48(sp)
; RV64I-NEXT: sd a3, 32(sp)
; RV64I-NEXT: andi a0, a1, 24
; RV64I-NEXT: addi a3, sp, 32
@@ -1975,60 +1975,60 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 17(a0)
+; RV32I-NEXT: lbu a6, 16(a0)
+; RV32I-NEXT: lbu a7, 18(a0)
+; RV32I-NEXT: lbu t0, 19(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: lbu a6, 13(a0)
-; RV32I-NEXT: lbu a7, 12(a0)
-; RV32I-NEXT: lbu t0, 14(a0)
-; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: lbu a6, 25(a0)
+; RV32I-NEXT: lbu a7, 24(a0)
+; RV32I-NEXT: lbu t0, 26(a0)
+; RV32I-NEXT: lbu t1, 27(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
; RV32I-NEXT: or a7, t1, t0
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: lbu a7, 17(a0)
-; RV32I-NEXT: lbu t0, 16(a0)
-; RV32I-NEXT: lbu t1, 18(a0)
-; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: lbu a7, 5(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or t0, t2, t1
; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: lbu t0, 21(a0)
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 22(a0)
-; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: lbu t0, 13(a0)
+; RV32I-NEXT: lbu t1, 12(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu t3, 15(a0)
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: or t0, t0, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: or t1, t3, t2
; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: lbu t1, 25(a0)
-; RV32I-NEXT: lbu t2, 24(a0)
-; RV32I-NEXT: lbu t3, 26(a0)
-; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: lbu t1, 21(a0)
+; RV32I-NEXT: lbu t2, 20(a0)
+; RV32I-NEXT: lbu t3, 22(a0)
+; RV32I-NEXT: lbu t4, 23(a0)
; RV32I-NEXT: slli t1, t1, 8
; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: slli t3, t3, 16
@@ -2055,24 +2055,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, t4
; RV32I-NEXT: or a1, a1, t2
-; RV32I-NEXT: sw zero, 32(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 4(sp)
-; RV32I-NEXT: sw a0, 64(sp)
-; RV32I-NEXT: sw t1, 60(sp)
-; RV32I-NEXT: sw t0, 56(sp)
-; RV32I-NEXT: sw a7, 52(sp)
-; RV32I-NEXT: sw a6, 48(sp)
-; RV32I-NEXT: sw a5, 44(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t1, 52(sp)
+; RV32I-NEXT: sw t0, 44(sp)
+; RV32I-NEXT: sw a7, 36(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a6, 56(sp)
+; RV32I-NEXT: sw a5, 48(sp)
; RV32I-NEXT: sw a4, 40(sp)
-; RV32I-NEXT: sw a3, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
; RV32I-NEXT: andi a0, a1, 28
-; RV32I-NEXT: addi a3, sp, 36
+; RV32I-NEXT: addi a3, sp, 32
; RV32I-NEXT: sub a6, a3, a0
; RV32I-NEXT: lw a3, 4(a6)
; RV32I-NEXT: slli a7, a1, 3
@@ -2208,20 +2208,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 9(a0)
-; RV64I-NEXT: lbu a5, 8(a0)
-; RV64I-NEXT: lbu a6, 10(a0)
-; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: lbu a4, 17(a0)
+; RV64I-NEXT: lbu a5, 16(a0)
+; RV64I-NEXT: lbu a6, 18(a0)
+; RV64I-NEXT: lbu a7, 19(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 13(a0)
-; RV64I-NEXT: lbu a6, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a5, 21(a0)
+; RV64I-NEXT: lbu a6, 20(a0)
+; RV64I-NEXT: lbu a7, 22(a0)
+; RV64I-NEXT: lbu t0, 23(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2230,20 +2230,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 17(a0)
-; RV64I-NEXT: lbu a6, 16(a0)
-; RV64I-NEXT: lbu a7, 18(a0)
-; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: lbu a5, 9(a0)
+; RV64I-NEXT: lbu a6, 8(a0)
+; RV64I-NEXT: lbu a7, 10(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 21(a0)
-; RV64I-NEXT: lbu a7, 20(a0)
-; RV64I-NEXT: lbu t0, 22(a0)
-; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu a7, 12(a0)
+; RV64I-NEXT: lbu t0, 14(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -2298,12 +2298,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a1, a1, a7
; RV64I-NEXT: sraiw a0, a0, 31
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: sd a0, 48(sp)
; RV64I-NEXT: sd a0, 40(sp)
-; RV64I-NEXT: sd a0, 32(sp)
; RV64I-NEXT: sd a6, 24(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a5, 8(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a4, 16(sp)
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: andi a0, a1, 24
; RV64I-NEXT: mv a3, sp
@@ -2407,60 +2407,60 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 17(a0)
+; RV32I-NEXT: lbu a6, 16(a0)
+; RV32I-NEXT: lbu a7, 18(a0)
+; RV32I-NEXT: lbu t0, 19(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: lbu a6, 13(a0)
-; RV32I-NEXT: lbu a7, 12(a0)
-; RV32I-NEXT: lbu t0, 14(a0)
-; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: lbu a6, 25(a0)
+; RV32I-NEXT: lbu a7, 24(a0)
+; RV32I-NEXT: lbu t0, 26(a0)
+; RV32I-NEXT: lbu t1, 27(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
; RV32I-NEXT: or a7, t1, t0
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: lbu a7, 17(a0)
-; RV32I-NEXT: lbu t0, 16(a0)
-; RV32I-NEXT: lbu t1, 18(a0)
-; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: lbu a7, 5(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or t0, t2, t1
; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: lbu t0, 21(a0)
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 22(a0)
-; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: lbu t0, 13(a0)
+; RV32I-NEXT: lbu t1, 12(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu t3, 15(a0)
; RV32I-NEXT: slli t0, t0, 8
; RV32I-NEXT: or t0, t0, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: or t1, t3, t2
; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: lbu t1, 25(a0)
-; RV32I-NEXT: lbu t2, 24(a0)
-; RV32I-NEXT: lbu t3, 26(a0)
-; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: lbu t1, 21(a0)
+; RV32I-NEXT: lbu t2, 20(a0)
+; RV32I-NEXT: lbu t3, 22(a0)
+; RV32I-NEXT: lbu t4, 23(a0)
; RV32I-NEXT: slli t1, t1, 8
; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: slli t3, t3, 16
@@ -2488,24 +2488,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, t5
; RV32I-NEXT: or a1, a1, t3
; RV32I-NEXT: srai a0, a0, 31
-; RV32I-NEXT: sw a0, 64(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: sw a0, 48(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 36(sp)
-; RV32I-NEXT: sw t2, 32(sp)
-; RV32I-NEXT: sw t1, 28(sp)
-; RV32I-NEXT: sw t0, 24(sp)
-; RV32I-NEXT: sw a7, 20(sp)
-; RV32I-NEXT: sw a6, 16(sp)
-; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 12(sp)
+; RV32I-NEXT: sw a7, 4(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a6, 24(sp)
+; RV32I-NEXT: sw a5, 16(sp)
; RV32I-NEXT: sw a4, 8(sp)
-; RV32I-NEXT: sw a3, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: andi a0, a1, 28
-; RV32I-NEXT: addi a3, sp, 4
+; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a5, a3, a0
; RV32I-NEXT: lw a3, 4(a5)
; RV32I-NEXT: slli a6, a1, 3
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 7e879b137b4f0..d85b9430c4fec 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -715,20 +715,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 5(a0)
+; RV32I-NEXT: lbu a6, 4(a0)
+; RV32I-NEXT: lbu a7, 6(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
@@ -756,12 +756,12 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, t0
; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw a0, 12(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a4, 8(sp)
; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: srli a0, a1, 3
; RV32I-NEXT: andi a0, a0, 12
@@ -951,20 +951,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 5(a0)
+; RV32I-NEXT: lbu a6, 4(a0)
+; RV32I-NEXT: lbu a7, 6(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
@@ -992,12 +992,12 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, t0
; RV32I-NEXT: or a1, a1, a6
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 4(sp)
-; RV32I-NEXT: sw zero, 0(sp)
; RV32I-NEXT: sw a0, 28(sp)
-; RV32I-NEXT: sw a5, 24(sp)
-; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a5, 20(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a4, 24(sp)
; RV32I-NEXT: sw a3, 16(sp)
; RV32I-NEXT: srli a0, a1, 3
; RV32I-NEXT: andi a0, a0, 12
@@ -1187,20 +1187,20 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 5(a0)
+; RV32I-NEXT: lbu a6, 4(a0)
+; RV32I-NEXT: lbu a7, 6(a0)
+; RV32I-NEXT: lbu t0, 7(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
@@ -1229,12 +1229,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, a7
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: sw a0, 28(sp)
-; RV32I-NEXT: sw a0, 24(sp)
; RV32I-NEXT: sw a0, 20(sp)
-; RV32I-NEXT: sw a0, 16(sp)
; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a4, 8(sp)
; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: srli a0, a1, 3
; RV32I-NEXT: andi a0, a0, 12
@@ -1322,20 +1322,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 9(a0)
-; RV64I-NEXT: lbu a5, 8(a0)
-; RV64I-NEXT: lbu a6, 10(a0)
-; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: lbu a4, 17(a0)
+; RV64I-NEXT: lbu a5, 16(a0)
+; RV64I-NEXT: lbu a6, 18(a0)
+; RV64I-NEXT: lbu a7, 19(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 13(a0)
-; RV64I-NEXT: lbu a6, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a5, 21(a0)
+; RV64I-NEXT: lbu a6, 20(a0)
+; RV64I-NEXT: lbu a7, 22(a0)
+; RV64I-NEXT: lbu t0, 23(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -1344,20 +1344,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 17(a0)
-; RV64I-NEXT: lbu a6, 16(a0)
-; RV64I-NEXT: lbu a7, 18(a0)
-; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: lbu a5, 9(a0)
+; RV64I-NEXT: lbu a6, 8(a0)
+; RV64I-NEXT: lbu a7, 10(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 21(a0)
-; RV64I-NEXT: lbu a7, 20(a0)
-; RV64I-NEXT: lbu t0, 22(a0)
-; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu a7, 12(a0)
+; RV64I-NEXT: lbu t0, 14(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -1411,12 +1411,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 40(sp)
-; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd a0, 24(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a5, 8(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a4, 16(sp)
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: srli a0, a1, 3
; RV64I-NEXT: andi a0, a0, 24
@@ -1517,60 +1517,60 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 17(a0)
+; RV32I-NEXT: lbu a6, 16(a0)
+; RV32I-NEXT: lbu a7, 18(a0)
+; RV32I-NEXT: lbu t0, 19(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: lbu a6, 13(a0)
-; RV32I-NEXT: lbu a7, 12(a0)
-; RV32I-NEXT: lbu t0, 14(a0)
-; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: lbu a6, 25(a0)
+; RV32I-NEXT: lbu a7, 24(a0)
+; RV32I-NEXT: lbu t0, 26(a0)
+; RV32I-NEXT: lbu t1, 27(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
; RV32I-NEXT: or a7, t1, t0
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: lbu a7, 17(a0)
-; RV32I-NEXT: lbu t0, 16(a0)
-; RV32I-NEXT: lbu t1, 18(a0)
-; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: lbu a7, 5(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or t0, t2, t1
; RV32I-NEXT: or t0, t0, a7
-; RV32I-NEXT: lbu a7, 21(a0)
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 22(a0)
-; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t1, 12(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu t3, 15(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: or t1, t3, t2
; RV32I-NEXT: or t1, t1, a7
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu t2, 24(a0)
-; RV32I-NEXT: lbu t3, 26(a0)
-; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t2, 20(a0)
+; RV32I-NEXT: lbu t3, 22(a0)
+; RV32I-NEXT: lbu t4, 23(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
@@ -1598,20 +1598,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, t4
; RV32I-NEXT: or a7, a1, a7
; RV32I-NEXT: sw zero, 60(sp)
-; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 52(sp)
-; RV32I-NEXT: sw zero, 48(sp)
; RV32I-NEXT: sw zero, 44(sp)
-; RV32I-NEXT: sw zero, 40(sp)
; RV32I-NEXT: sw zero, 36(sp)
-; RV32I-NEXT: sw zero, 32(sp)
; RV32I-NEXT: sw a0, 28(sp)
-; RV32I-NEXT: sw t2, 24(sp)
-; RV32I-NEXT: sw t1, 20(sp)
-; RV32I-NEXT: sw t0, 16(sp)
-; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw t2, 20(sp)
+; RV32I-NEXT: sw t1, 12(sp)
+; RV32I-NEXT: sw t0, 4(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a6, 24(sp)
+; RV32I-NEXT: sw a5, 16(sp)
+; RV32I-NEXT: sw a4, 8(sp)
; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: srli a0, a7, 3
; RV32I-NEXT: andi a0, a0, 28
@@ -1746,20 +1746,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 9(a0)
-; RV64I-NEXT: lbu a5, 8(a0)
-; RV64I-NEXT: lbu a6, 10(a0)
-; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: lbu a4, 17(a0)
+; RV64I-NEXT: lbu a5, 16(a0)
+; RV64I-NEXT: lbu a6, 18(a0)
+; RV64I-NEXT: lbu a7, 19(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 13(a0)
-; RV64I-NEXT: lbu a6, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a5, 21(a0)
+; RV64I-NEXT: lbu a6, 20(a0)
+; RV64I-NEXT: lbu a7, 22(a0)
+; RV64I-NEXT: lbu t0, 23(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -1768,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 17(a0)
-; RV64I-NEXT: lbu a6, 16(a0)
-; RV64I-NEXT: lbu a7, 18(a0)
-; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: lbu a5, 9(a0)
+; RV64I-NEXT: lbu a6, 8(a0)
+; RV64I-NEXT: lbu a7, 10(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 21(a0)
-; RV64I-NEXT: lbu a7, 20(a0)
-; RV64I-NEXT: lbu t0, 22(a0)
-; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu a7, 12(a0)
+; RV64I-NEXT: lbu t0, 14(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -1835,12 +1835,12 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: or a1, a1, a6
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 8(sp)
-; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: sd a5, 48(sp)
-; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a5, 40(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a4, 48(sp)
; RV64I-NEXT: sd a3, 32(sp)
; RV64I-NEXT: srli a0, a1, 3
; RV64I-NEXT: andi a0, a0, 24
@@ -1941,60 +1941,60 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 17(a0)
+; RV32I-NEXT: lbu a6, 16(a0)
+; RV32I-NEXT: lbu a7, 18(a0)
+; RV32I-NEXT: lbu t0, 19(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: lbu a6, 13(a0)
-; RV32I-NEXT: lbu a7, 12(a0)
-; RV32I-NEXT: lbu t0, 14(a0)
-; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: lbu a6, 25(a0)
+; RV32I-NEXT: lbu a7, 24(a0)
+; RV32I-NEXT: lbu t0, 26(a0)
+; RV32I-NEXT: lbu t1, 27(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
; RV32I-NEXT: or a7, t1, t0
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: lbu a7, 17(a0)
-; RV32I-NEXT: lbu t0, 16(a0)
-; RV32I-NEXT: lbu t1, 18(a0)
-; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: lbu a7, 5(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or t0, t2, t1
; RV32I-NEXT: or t0, t0, a7
-; RV32I-NEXT: lbu a7, 21(a0)
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 22(a0)
-; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t1, 12(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu t3, 15(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: or t1, t3, t2
; RV32I-NEXT: or t1, t1, a7
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu t2, 24(a0)
-; RV32I-NEXT: lbu t3, 26(a0)
-; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t2, 20(a0)
+; RV32I-NEXT: lbu t3, 22(a0)
+; RV32I-NEXT: lbu t4, 23(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
@@ -2022,20 +2022,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a1, a1, t4
; RV32I-NEXT: or a7, a1, a7
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 20(sp)
-; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 4(sp)
-; RV32I-NEXT: sw zero, 0(sp)
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw t2, 56(sp)
-; RV32I-NEXT: sw t1, 52(sp)
-; RV32I-NEXT: sw t0, 48(sp)
-; RV32I-NEXT: sw a6, 44(sp)
-; RV32I-NEXT: sw a5, 40(sp)
-; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw t2, 52(sp)
+; RV32I-NEXT: sw t1, 44(sp)
+; RV32I-NEXT: sw t0, 36(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a6, 56(sp)
+; RV32I-NEXT: sw a5, 48(sp)
+; RV32I-NEXT: sw a4, 40(sp)
; RV32I-NEXT: sw a3, 32(sp)
; RV32I-NEXT: srli a0, a7, 3
; RV32I-NEXT: andi a0, a0, 28
@@ -2170,20 +2170,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a4, a5, a4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: lbu a4, 9(a0)
-; RV64I-NEXT: lbu a5, 8(a0)
-; RV64I-NEXT: lbu a6, 10(a0)
-; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: lbu a4, 17(a0)
+; RV64I-NEXT: lbu a5, 16(a0)
+; RV64I-NEXT: lbu a6, 18(a0)
+; RV64I-NEXT: lbu a7, 19(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 13(a0)
-; RV64I-NEXT: lbu a6, 12(a0)
-; RV64I-NEXT: lbu a7, 14(a0)
-; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: lbu a5, 21(a0)
+; RV64I-NEXT: lbu a6, 20(a0)
+; RV64I-NEXT: lbu a7, 22(a0)
+; RV64I-NEXT: lbu t0, 23(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2192,20 +2192,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 17(a0)
-; RV64I-NEXT: lbu a6, 16(a0)
-; RV64I-NEXT: lbu a7, 18(a0)
-; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: lbu a5, 9(a0)
+; RV64I-NEXT: lbu a6, 8(a0)
+; RV64I-NEXT: lbu a7, 10(a0)
+; RV64I-NEXT: lbu t0, 11(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 21(a0)
-; RV64I-NEXT: lbu a7, 20(a0)
-; RV64I-NEXT: lbu t0, 22(a0)
-; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: lbu a6, 13(a0)
+; RV64I-NEXT: lbu a7, 12(a0)
+; RV64I-NEXT: lbu t0, 14(a0)
+; RV64I-NEXT: lbu t1, 15(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -2260,12 +2260,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a1, a1, a7
; RV64I-NEXT: sraiw a0, a0, 31
; RV64I-NEXT: sd a0, 56(sp)
-; RV64I-NEXT: sd a0, 48(sp)
; RV64I-NEXT: sd a0, 40(sp)
-; RV64I-NEXT: sd a0, 32(sp)
; RV64I-NEXT: sd a6, 24(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a5, 8(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a4, 16(sp)
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: srli a0, a1, 3
; RV64I-NEXT: andi a0, a0, 24
@@ -2366,60 +2366,60 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: lbu a4, 5(a0)
-; RV32I-NEXT: lbu a5, 4(a0)
-; RV32I-NEXT: lbu a6, 6(a0)
-; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: lbu a4, 9(a0)
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 10(a0)
+; RV32I-NEXT: lbu a7, 11(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: or a4, a4, a5
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a5, a7, a6
; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: lbu a5, 9(a0)
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 10(a0)
-; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: lbu a5, 17(a0)
+; RV32I-NEXT: lbu a6, 16(a0)
+; RV32I-NEXT: lbu a7, 18(a0)
+; RV32I-NEXT: lbu t0, 19(a0)
; RV32I-NEXT: slli a5, a5, 8
; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: lbu a6, 13(a0)
-; RV32I-NEXT: lbu a7, 12(a0)
-; RV32I-NEXT: lbu t0, 14(a0)
-; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: lbu a6, 25(a0)
+; RV32I-NEXT: lbu a7, 24(a0)
+; RV32I-NEXT: lbu t0, 26(a0)
+; RV32I-NEXT: lbu t1, 27(a0)
; RV32I-NEXT: slli a6, a6, 8
; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
; RV32I-NEXT: or a7, t1, t0
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: lbu a7, 17(a0)
-; RV32I-NEXT: lbu t0, 16(a0)
-; RV32I-NEXT: lbu t1, 18(a0)
-; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: lbu a7, 5(a0)
+; RV32I-NEXT: lbu t0, 4(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
; RV32I-NEXT: or t0, t2, t1
; RV32I-NEXT: or t0, t0, a7
-; RV32I-NEXT: lbu a7, 21(a0)
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 22(a0)
-; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: lbu a7, 13(a0)
+; RV32I-NEXT: lbu t1, 12(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu t3, 15(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t1
; RV32I-NEXT: slli t2, t2, 16
; RV32I-NEXT: slli t3, t3, 24
; RV32I-NEXT: or t1, t3, t2
; RV32I-NEXT: or t1, t1, a7
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu t2, 24(a0)
-; RV32I-NEXT: lbu t3, 26(a0)
-; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t2, 20(a0)
+; RV32I-NEXT: lbu t3, 22(a0)
+; RV32I-NEXT: lbu t4, 23(a0)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
@@ -2448,20 +2448,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: or a7, a1, a7
; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: sw a0, 60(sp)
-; RV32I-NEXT: sw a0, 56(sp)
; RV32I-NEXT: sw a0, 52(sp)
-; RV32I-NEXT: sw a0, 48(sp)
; RV32I-NEXT: sw a0, 44(sp)
-; RV32I-NEXT: sw a0, 40(sp)
; RV32I-NEXT: sw a0, 36(sp)
-; RV32I-NEXT: sw a0, 32(sp)
; RV32I-NEXT: sw t3, 28(sp)
-; RV32I-NEXT: sw t2, 24(sp)
-; RV32I-NEXT: sw t1, 20(sp)
-; RV32I-NEXT: sw t0, 16(sp)
-; RV32I-NEXT: sw a6, 12(sp)
-; RV32I-NEXT: sw a5, 8(sp)
-; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw t2, 20(sp)
+; RV32I-NEXT: sw t1, 12(sp)
+; RV32I-NEXT: sw t0, 4(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw a6, 24(sp)
+; RV32I-NEXT: sw a5, 16(sp)
+; RV32I-NEXT: sw a4, 8(sp)
; RV32I-NEXT: sw a3, 0(sp)
; RV32I-NEXT: srli a0, a7, 3
; RV32I-NEXT: andi a0, a0, 28
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 54106bde42527..33654bb250b1e 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -174,93 +174,96 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86: # %bb.0: # %_udiv-special-cases
; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $152, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $176, %esp
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 16(%ebp), %edx
; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: xorl %eax, %ecx
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %esi
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: xorl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl 28(%ebp), %edi
; X86-NEXT: xorl %edx, %edi
; X86-NEXT: subl %edx, %edi
-; X86-NEXT: sbbl %edx, %ebp
-; X86-NEXT: sbbl %edx, %esi
; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %esi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebp, %edx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %edi, %edi
; X86-NEXT: xorl $31, %edi
; X86-NEXT: addl $32, %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebp, %ebp
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %edi
; X86-NEXT: addl $64, %edi
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: bsrl %ebp, %ecx
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -269,133 +272,131 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
-; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: subl %edx, %edi
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebp, %ebp
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
; X86-NEXT: movl $0, %eax
; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %edi, %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: movl %esi, %edi
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: cmovnel %esi, %ecx
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: jne .LBB4_1
-; X86-NEXT: # %bb.8: # %_udiv-special-cases
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %esi
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %eax
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB4_8
+; X86-NEXT: # %bb.1: # %_udiv-special-cases
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: xorl $127, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: je .LBB4_9
-; X86-NEXT: # %bb.5: # %udiv-bb1
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: je .LBB4_8
+; X86-NEXT: # %bb.2: # %udiv-bb1
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorb $127, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 144(%esp,%eax), %edx
-; X86-NEXT: movl 148(%esp,%eax), %esi
+; X86-NEXT: movl 152(%esp,%eax), %edx
+; X86-NEXT: movl 156(%esp,%eax), %esi
; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esp,%eax), %esi
-; X86-NEXT: movl 140(%esp,%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %edx
-; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 148(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: movl 144(%esp,%eax), %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: shll %cl, %esi
-; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: jae .LBB4_2
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jae .LBB4_3
; X86-NEXT: # %bb.6:
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: jmp .LBB4_9
-; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jmp .LBB4_7
+; X86-NEXT: .LBB4_3: # %udiv-preheader
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 108(%esp,%eax), %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movl 100(%esp,%edx), %ebx
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%edx), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 104(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shrdl %cl, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 100(%esp,%eax), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl 96(%esp,%eax), %eax
; X86-NEXT: shrdl %cl, %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esp,%edx), %eax
-; X86-NEXT: movl 92(%esp,%edx), %edx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: shrdl %cl, %esi, %ebp
-; X86-NEXT: shrl %cl, %ebx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -410,174 +411,177 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB4_3: # %udiv-do-while
+; X86-NEXT: .LBB4_4: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %ebp, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl $1, %eax, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $1, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ebp
-; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl $-1, %edx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %ecx
; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: jne .LBB4_3
-; X86-NEXT: # %bb.4:
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB4_4
+; X86-NEXT: # %bb.5:
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: shldl $1, %ecx, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: shldl $1, %esi, %ecx
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: .LBB4_9: # %udiv-end
-; X86-NEXT: xorl %ebp, %eax
-; X86-NEXT: xorl %ebp, %edx
-; X86-NEXT: xorl %ebp, %ecx
-; X86-NEXT: xorl %ebp, %esi
-; X86-NEXT: subl %ebp, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %ebp, %ecx
-; X86-NEXT: sbbl %ebp, %edx
-; X86-NEXT: sbbl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, (%edi)
-; X86-NEXT: movl %ecx, 4(%edi)
-; X86-NEXT: movl %edx, 8(%edi)
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: .LBB4_8: # %udiv-end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %ebx, (%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl %edx, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %ecx
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: imull %eax, %ebx
; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: imull %esi, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: imull %edx, %esi
+; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: imull (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: imull %edx, %ebx
; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: addl $152, %esp
+; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 84f35c6485abe..d4f62d1aa7c1c 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -174,68 +174,69 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86: # %bb.0: # %_udiv-special-cases
; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $132, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl 36(%ebp), %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sete %bl
-; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: orl 24(%ebp), %eax
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: orb %bl, %al
-; X86-NEXT: movb %al, (%esp) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
+; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: bsrl 36(%ebp), %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl 32(%ebp), %ebx
; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebp, %ebp
-; X86-NEXT: xorl $31, %ebp
-; X86-NEXT: addl $32, %ebp
+; X86-NEXT: bsrl 28(%ebp), %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: addl $32, %eax
; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnel %edx, %ebp
-; X86-NEXT: addl $64, %ebp
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: addl $64, %eax
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: movl 24(%ebp), %ebx
; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: bsrl %esi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: bsrl 12(%ebp), %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl 20(%ebp), %edi
; X86-NEXT: movl %edi, %esi
; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: subl %edx, %ebp
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %ebx
@@ -243,7 +244,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
; X86-NEXT: movl $127, %ecx
-; X86-NEXT: cmpl %ebp, %ecx
+; X86-NEXT: cmpl %eax, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
@@ -251,9 +252,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: setb %cl
-; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: xorl $127, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %eax
@@ -263,35 +263,31 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: cmovnel %edx, %esi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: cmovnel %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmovnel %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: cmovnel %edi, %edx
-; X86-NEXT: orb %cl, %al
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movb %cl, %ah
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovnel %ecx, %ebx
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: cmovnel %ecx, %edi
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: cmovnel %esi, %ecx
+; X86-NEXT: orb %ah, %al
+; X86-NEXT: movl 44(%ebp), %eax
; X86-NEXT: jne .LBB4_7
; X86-NEXT: # %bb.1: # %udiv-bb1
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 24(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: xorb $127, %cl
@@ -300,144 +296,142 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 124(%esp,%eax), %ebp
-; X86-NEXT: movl 128(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %ebp, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esp,%eax), %edx
-; X86-NEXT: movl 120(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: shldl %cl, %edx, %ebp
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 132(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl 128(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.5:
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %esi, %esi
; X86-NEXT: jmp .LBB4_6
; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 24(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 80(%esp,%eax), %esi
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 76(%esp,%eax), %ebp
-; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: shrdl %cl, %esi, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%esp,%eax), %ecx
+; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: movl 92(%esp,%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esp,%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esp,%eax), %eax
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: shrdl %cl, %edx, %ebx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 84(%esp,%esi), %edx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%esi), %eax
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 32(%ebp), %eax
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %esi, %esi
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ebp
-; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: shldl $1, %edx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: shldl $1, %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %edi, %eax
-; X86-NEXT: orl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %ebp, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: sbbl %ebp, %ebx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: andl 40(%ebp), %edi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl 36(%ebp), %esi
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl 32(%ebp), %edx
+; X86-NEXT: andl 28(%ebp), %ecx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: sbbl %edi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
@@ -445,96 +439,95 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %edi, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: .LBB4_6: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: shldl $1, %ebp, %ebx
-; X86-NEXT: orl %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %ebp
-; X86-NEXT: orl %eax, %ebp
+; X86-NEXT: .LBB4_6: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 44(%ebp), %eax
; X86-NEXT: .LBB4_7: # %udiv-end
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, (%esi)
-; X86-NEXT: movl %ebp, 4(%esi)
-; X86-NEXT: movl %ebx, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %ebp, %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: imull %edi, %esi
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %ebp, %edi
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: imull %ecx, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull 28(%ebp), %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: imull %edx, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull 32(%ebp)
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull 32(%ebp)
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl 24(%ebp), %ecx
; X86-NEXT: sbbl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %esi, (%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: addl $132, %esp
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 3dbd0213293bb..17ad1ef67ef79 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,49 +22,49 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $160, %esp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: subl $176, %esp
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl (%eax), %eax
; X86-NEXT: movzbl (%eax), %ecx
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: divb %cl
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $30, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: sarl $30, %ecx
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %edi
; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %esi
; X86-NEXT: shrdl $1, %eax, %ecx
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: subl %ecx, %edi
+; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: andl $3, %edi
-; X86-NEXT: testl %edx, %edx
+; X86-NEXT: andl $3, %edx
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: bsrl %edi, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: addl $32, %eax
; X86-NEXT: jmp .LBB0_3
; X86-NEXT: .LBB0_1:
-; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: bsrl %esi, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB0_4
; X86-NEXT: # %bb.5: # %BB_udiv-special-cases
; X86-NEXT: addl $64, %eax
; X86-NEXT: jmp .LBB0_6
; X86-NEXT: .LBB0_4:
-; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: bsrl %edx, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: addl $32, %eax
; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases
@@ -104,32 +104,28 @@ define void @f() nounwind {
; X86-NEXT: movsbl %al, %esi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 128(%esp,%esi), %edi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 132(%esp,%esi), %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 112(%esp,%esi), %eax
-; X86-NEXT: movl 116(%esp,%esi), %edx
-; X86-NEXT: movl 120(%esp,%esi), %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl 136(%esp,%esi), %eax
+; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.9: # %udiv-preheader
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -139,14 +135,17 @@ define void @f() nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 72(%esp,%eax), %edx
-; X86-NEXT: movl 64(%esp,%eax), %edi
-; X86-NEXT: movl 68(%esp,%eax), %eax
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl 88(%esp,%eax), %edx
+; X86-NEXT: movl 84(%esp,%eax), %edi
+; X86-NEXT: movl %edi, %ebx
; X86-NEXT: shrdl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 80(%esp,%eax), %eax
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
@@ -163,29 +162,30 @@ define void @f() nounwind {
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_10: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: andl $2, %edx
; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%ebx,2), %ebx
+; X86-NEXT: leal (%edx,%esi,2), %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: sbbl %ebx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: sbbl %ecx, %edx
; X86-NEXT: shll $30, %edx
@@ -200,10 +200,10 @@ define void @f() nounwind {
; X86-NEXT: movl %edx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: subl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: subl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index b2ff06798aad7..37dab8a80e0b4 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -13,53 +13,48 @@ define i256 @test1(i256 %a) nounwind {
; ILP-LABEL: test1:
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
+; ILP-NEXT: xorps %xmm0, %xmm0
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: leal (%rsi,%rsi), %ecx
; ILP-NEXT: addb $3, %cl
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
+; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: andb $24, %dl
; ILP-NEXT: negb %dl
; ILP-NEXT: movsbq %dl, %rdx
-; ILP-NEXT: movq -16(%rsp,%rdx), %rsi
-; ILP-NEXT: movq -8(%rsp,%rdx), %rdi
-; ILP-NEXT: shldq %cl, %rsi, %rdi
+; ILP-NEXT: movq -40(%rsp,%rdx), %rsi
+; ILP-NEXT: movq -24(%rsp,%rdx), %rdi
; ILP-NEXT: movq -32(%rsp,%rdx), %r8
-; ILP-NEXT: movq -24(%rsp,%rdx), %rdx
+; ILP-NEXT: movq -16(%rsp,%rdx), %rdx
; ILP-NEXT: movq %r8, %r9
-; ILP-NEXT: shlq %cl, %r9
-; ILP-NEXT: movq %rdx, %r10
-; ILP-NEXT: shldq %cl, %r8, %r10
-; ILP-NEXT: movq %rdi, 24(%rax)
-; ILP-NEXT: movq %r10, 8(%rax)
-; ILP-NEXT: movq %r9, (%rax)
+; ILP-NEXT: shldq %cl, %rsi, %r9
+; ILP-NEXT: shldq %cl, %rdi, %rdx
; ILP-NEXT: shlq %cl, %rsi
+; ILP-NEXT: movq %rdx, 24(%rax)
+; ILP-NEXT: movq %r9, 8(%rax)
+; ILP-NEXT: movq %rsi, (%rax)
+; ILP-NEXT: shlq %cl, %rdi
; ILP-NEXT: notb %cl
-; ILP-NEXT: shrq %rdx
+; ILP-NEXT: shrq %r8
; ILP-NEXT: # kill: def $cl killed $cl killed $ecx
-; ILP-NEXT: shrq %cl, %rdx
-; ILP-NEXT: orq %rsi, %rdx
-; ILP-NEXT: movq %rdx, 16(%rax)
+; ILP-NEXT: shrq %cl, %r8
+; ILP-NEXT: orq %rdi, %r8
+; ILP-NEXT: movq %r8, 16(%rax)
; ILP-NEXT: retq
;
; HYBRID-LABEL: test1:
; HYBRID: # %bb.0:
; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: xorps %xmm0, %xmm0
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: leal (%rsi,%rsi), %ecx
; HYBRID-NEXT: addb $3, %cl
; HYBRID-NEXT: movl %ecx, %edx
@@ -67,37 +62,35 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: andb $24, %dl
; HYBRID-NEXT: negb %dl
; HYBRID-NEXT: movsbq %dl, %rdx
-; HYBRID-NEXT: movq -16(%rsp,%rdx), %rsi
-; HYBRID-NEXT: movq -8(%rsp,%rdx), %rdi
-; HYBRID-NEXT: shldq %cl, %rsi, %rdi
-; HYBRID-NEXT: movq %rdi, 24(%rax)
-; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdi
-; HYBRID-NEXT: movq -24(%rsp,%rdx), %rdx
-; HYBRID-NEXT: movq %rdx, %r8
-; HYBRID-NEXT: shldq %cl, %rdi, %r8
-; HYBRID-NEXT: movq %r8, 8(%rax)
-; HYBRID-NEXT: shlq %cl, %rdi
-; HYBRID-NEXT: movq %rdi, (%rax)
+; HYBRID-NEXT: movq -40(%rsp,%rdx), %rsi
+; HYBRID-NEXT: movq -24(%rsp,%rdx), %rdi
+; HYBRID-NEXT: movq -32(%rsp,%rdx), %r8
+; HYBRID-NEXT: movq -16(%rsp,%rdx), %rdx
+; HYBRID-NEXT: shldq %cl, %rdi, %rdx
+; HYBRID-NEXT: movq %rdx, 24(%rax)
+; HYBRID-NEXT: movq %r8, %rdx
+; HYBRID-NEXT: shldq %cl, %rsi, %rdx
+; HYBRID-NEXT: movq %rdx, 8(%rax)
; HYBRID-NEXT: shlq %cl, %rsi
+; HYBRID-NEXT: movq %rsi, (%rax)
+; HYBRID-NEXT: shlq %cl, %rdi
; HYBRID-NEXT: notb %cl
-; HYBRID-NEXT: shrq %rdx
+; HYBRID-NEXT: shrq %r8
; HYBRID-NEXT: # kill: def $cl killed $cl killed $ecx
-; HYBRID-NEXT: shrq %cl, %rdx
-; HYBRID-NEXT: orq %rsi, %rdx
-; HYBRID-NEXT: movq %rdx, 16(%rax)
+; HYBRID-NEXT: shrq %cl, %r8
+; HYBRID-NEXT: orq %rdi, %r8
+; HYBRID-NEXT: movq %r8, 16(%rax)
; HYBRID-NEXT: retq
;
; BURR-LABEL: test1:
; BURR: # %bb.0:
; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: xorps %xmm0, %xmm0
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: leal (%rsi,%rsi), %ecx
; BURR-NEXT: addb $3, %cl
; BURR-NEXT: movl %ecx, %edx
@@ -105,24 +98,24 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: andb $24, %dl
; BURR-NEXT: negb %dl
; BURR-NEXT: movsbq %dl, %rdx
-; BURR-NEXT: movq -16(%rsp,%rdx), %rsi
-; BURR-NEXT: movq -8(%rsp,%rdx), %rdi
-; BURR-NEXT: shldq %cl, %rsi, %rdi
-; BURR-NEXT: movq %rdi, 24(%rax)
-; BURR-NEXT: movq -32(%rsp,%rdx), %rdi
-; BURR-NEXT: movq -24(%rsp,%rdx), %rdx
-; BURR-NEXT: movq %rdx, %r8
-; BURR-NEXT: shldq %cl, %rdi, %r8
-; BURR-NEXT: movq %r8, 8(%rax)
-; BURR-NEXT: shlq %cl, %rdi
-; BURR-NEXT: movq %rdi, (%rax)
+; BURR-NEXT: movq -40(%rsp,%rdx), %rsi
+; BURR-NEXT: movq -24(%rsp,%rdx), %rdi
+; BURR-NEXT: movq -32(%rsp,%rdx), %r8
+; BURR-NEXT: movq -16(%rsp,%rdx), %rdx
+; BURR-NEXT: shldq %cl, %rdi, %rdx
+; BURR-NEXT: movq %rdx, 24(%rax)
+; BURR-NEXT: movq %r8, %rdx
+; BURR-NEXT: shldq %cl, %rsi, %rdx
+; BURR-NEXT: movq %rdx, 8(%rax)
; BURR-NEXT: shlq %cl, %rsi
+; BURR-NEXT: movq %rsi, (%rax)
+; BURR-NEXT: shlq %cl, %rdi
; BURR-NEXT: notb %cl
-; BURR-NEXT: shrq %rdx
+; BURR-NEXT: shrq %r8
; BURR-NEXT: # kill: def $cl killed $cl killed $ecx
-; BURR-NEXT: shrq %cl, %rdx
-; BURR-NEXT: orq %rsi, %rdx
-; BURR-NEXT: movq %rdx, 16(%rax)
+; BURR-NEXT: shrq %cl, %r8
+; BURR-NEXT: orq %rdi, %r8
+; BURR-NEXT: movq %r8, 16(%rax)
; BURR-NEXT: retq
;
; SRC-LABEL: test1:
@@ -130,36 +123,33 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: movq %rdi, %rax
; SRC-NEXT: leal (%rsi,%rsi), %edx
; SRC-NEXT: addb $3, %dl
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: xorps %xmm0, %xmm0
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shrb $3, %cl
; SRC-NEXT: andb $24, %cl
; SRC-NEXT: negb %cl
; SRC-NEXT: movsbq %cl, %rsi
-; SRC-NEXT: movq -16(%rsp,%rsi), %rdi
-; SRC-NEXT: movq %rdi, %r8
+; SRC-NEXT: movq -40(%rsp,%rsi), %rdi
+; SRC-NEXT: movq -24(%rsp,%rsi), %r8
+; SRC-NEXT: movq %r8, %r9
; SRC-NEXT: movl %edx, %ecx
-; SRC-NEXT: shlq %cl, %r8
+; SRC-NEXT: shlq %cl, %r9
+; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: notb %cl
-; SRC-NEXT: movq -32(%rsp,%rsi), %r9
-; SRC-NEXT: movq -24(%rsp,%rsi), %r10
+; SRC-NEXT: movq -32(%rsp,%rsi), %r10
+; SRC-NEXT: movq -16(%rsp,%rsi), %rsi
; SRC-NEXT: movq %r10, %r11
; SRC-NEXT: shrq %r11
; SRC-NEXT: shrq %cl, %r11
-; SRC-NEXT: orq %r8, %r11
-; SRC-NEXT: movq -8(%rsp,%rsi), %rsi
+; SRC-NEXT: orq %r9, %r11
; SRC-NEXT: movl %edx, %ecx
-; SRC-NEXT: shldq %cl, %rdi, %rsi
-; SRC-NEXT: movq %r9, %rdi
+; SRC-NEXT: shldq %cl, %rdi, %r10
+; SRC-NEXT: shldq %cl, %r8, %rsi
; SRC-NEXT: shlq %cl, %rdi
-; SRC-NEXT: shldq %cl, %r9, %r10
; SRC-NEXT: movq %rsi, 24(%rax)
; SRC-NEXT: movq %r10, 8(%rax)
; SRC-NEXT: movq %rdi, (%rax)
@@ -176,24 +166,22 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: andb $24, %cl
; LIN-NEXT: negb %cl
; LIN-NEXT: movsbq %cl, %rsi
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: xorps %xmm0, %xmm0
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; LIN-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq -32(%rsp,%rsi), %rdi
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: movq -40(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r8
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shlq %cl, %r8
; LIN-NEXT: movq %r8, (%rax)
-; LIN-NEXT: movq -24(%rsp,%rsi), %r8
+; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: movq -32(%rsp,%rsi), %r8
; LIN-NEXT: movq %r8, %r9
; LIN-NEXT: shldq %cl, %rdi, %r9
; LIN-NEXT: movq %r9, 8(%rax)
-; LIN-NEXT: movq -16(%rsp,%rsi), %rdi
+; LIN-NEXT: movq -24(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r9
; LIN-NEXT: shlq %cl, %r9
; LIN-NEXT: shrq %r8
@@ -201,7 +189,7 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: shrq %cl, %r8
; LIN-NEXT: orq %r9, %r8
; LIN-NEXT: movq %r8, 16(%rax)
-; LIN-NEXT: movq -8(%rsp,%rsi), %rsi
+; LIN-NEXT: movq -16(%rsp,%rsi), %rsi
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shldq %cl, %rdi, %rsi
; LIN-NEXT: movq %rsi, 24(%rax)
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index ed1ba5c59e500..e6d716afdbe8b 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -10,43 +10,44 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_lshr_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 16(%ebp), %eax
+; i686-NEXT: movl 20(%ebp), %edx
+; i686-NEXT: movl 8(%ebp), %esi
+; i686-NEXT: movl 12(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edx, (%esp)
+; i686-NEXT: movl %esi, (%esp)
+; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ecx, %eax
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: movzbl %al, %edi
+; i686-NEXT: movl 4(%esp,%edi), %edx
+; i686-NEXT: movl (%esp,%edi), %eax
+; i686-NEXT: shrdl %cl, %edx, %eax
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%esp,%edi), %esi
+; i686-NEXT: shrdl %cl, %esi, %edx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ecx, %edx
-; i686-NEXT: shrb $3, %dl
-; i686-NEXT: andb $12, %dl
-; i686-NEXT: movzbl %dl, %ebx
-; i686-NEXT: movl 8(%esp,%ebx), %esi
-; i686-NEXT: movl (%esp,%ebx), %edx
-; i686-NEXT: movl 4(%esp,%ebx), %ebp
-; i686-NEXT: movl %ebp, %edi
-; i686-NEXT: shrdl %cl, %esi, %edi
-; i686-NEXT: movl 12(%esp,%ebx), %ebx
-; i686-NEXT: shrdl %cl, %ebx, %esi
-; i686-NEXT: shrdl %cl, %ebp, %edx
+; i686-NEXT: movl 12(%esp,%edi), %edi
+; i686-NEXT: shrdl %cl, %edi, %esi
+; i686-NEXT: movl 40(%ebp), %ebx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shrl %cl, %ebx
-; i686-NEXT: movl %ebx, 12(%eax)
-; i686-NEXT: movl %esi, 8(%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: movl %edx, (%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shrl %cl, %edi
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %esi, 8(%ebx)
+; i686-NEXT: movl %edx, 4(%ebx)
+; i686-NEXT: movl %eax, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -75,44 +76,45 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_ashr_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 16(%ebp), %eax
+; i686-NEXT: movl 20(%ebp), %esi
+; i686-NEXT: movl 8(%ebp), %edx
+; i686-NEXT: movl 12(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, (%esp)
-; i686-NEXT: sarl $31, %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ecx, %edx
-; i686-NEXT: shrb $3, %dl
-; i686-NEXT: andb $12, %dl
-; i686-NEXT: movzbl %dl, %ebx
-; i686-NEXT: movl 8(%esp,%ebx), %esi
-; i686-NEXT: movl (%esp,%ebx), %edx
-; i686-NEXT: movl 4(%esp,%ebx), %ebp
-; i686-NEXT: movl %ebp, %edi
-; i686-NEXT: shrdl %cl, %esi, %edi
-; i686-NEXT: movl 12(%esp,%ebx), %ebx
-; i686-NEXT: shrdl %cl, %ebx, %esi
-; i686-NEXT: shrdl %cl, %ebp, %edx
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ecx, %eax
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: movzbl %al, %edi
+; i686-NEXT: movl 4(%esp,%edi), %edx
+; i686-NEXT: movl (%esp,%edi), %eax
+; i686-NEXT: shrdl %cl, %edx, %eax
+; i686-NEXT: movl 8(%esp,%edi), %esi
+; i686-NEXT: shrdl %cl, %esi, %edx
+; i686-NEXT: movl 12(%esp,%edi), %edi
+; i686-NEXT: shrdl %cl, %edi, %esi
+; i686-NEXT: movl 40(%ebp), %ebx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: sarl %cl, %ebx
-; i686-NEXT: movl %ebx, 12(%eax)
-; i686-NEXT: movl %esi, 8(%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: movl %edx, (%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: sarl %cl, %edi
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %esi, 8(%ebx)
+; i686-NEXT: movl %edx, 4(%ebx)
+; i686-NEXT: movl %eax, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -142,45 +144,49 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_shl_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $64, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 16(%ebp), %eax
+; i686-NEXT: movl 20(%ebp), %edx
+; i686-NEXT: movl 8(%ebp), %esi
+; i686-NEXT: movl 12(%ebp), %edi
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ecx, %eax
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: negb %al
+; i686-NEXT: movsbl %al, %esi
+; i686-NEXT: movl 32(%esp,%esi), %eax
+; i686-NEXT: movl 36(%esp,%esi), %edx
+; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: shldl %cl, %eax, %edx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl $0, (%esp)
-; i686-NEXT: movl %ecx, %edx
-; i686-NEXT: shrb $3, %dl
-; i686-NEXT: andb $12, %dl
-; i686-NEXT: negb %dl
-; i686-NEXT: movsbl %dl, %edi
-; i686-NEXT: movl 16(%esp,%edi), %edx
-; i686-NEXT: movl 20(%esp,%edi), %esi
-; i686-NEXT: movl 24(%esp,%edi), %ebx
-; i686-NEXT: movl %ebx, %ebp
-; i686-NEXT: shldl %cl, %esi, %ebp
-; i686-NEXT: movl 28(%esp,%edi), %edi
+; i686-NEXT: movl 40(%esp,%esi), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; i686-NEXT: shldl %cl, %ebx, %edi
-; i686-NEXT: movl %edi, 12(%eax)
-; i686-NEXT: movl %ebp, 8(%eax)
-; i686-NEXT: movl %edx, %edi
-; i686-NEXT: shll %cl, %edi
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 44(%esp,%esi), %esi
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT: shldl %cl, %ebx, %esi
+; i686-NEXT: movl 40(%ebp), %ebx
+; i686-NEXT: movl %esi, 12(%ebx)
+; i686-NEXT: movl %edi, 8(%ebx)
+; i686-NEXT: movl %edx, 4(%ebx)
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shldl %cl, %edx, %esi
-; i686-NEXT: movl %esi, 4(%eax)
-; i686-NEXT: movl %edi, (%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shll %cl, %eax
+; i686-NEXT: movl %eax, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -243,88 +249,87 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-LABEL: test_lshr_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $92, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $112, %esp
+; i686-NEXT: movl 40(%ebp), %edx
+; i686-NEXT: movl 32(%ebp), %eax
+; i686-NEXT: movl 36(%ebp), %ecx
+; i686-NEXT: movl 24(%ebp), %esi
+; i686-NEXT: movl 12(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 16(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 20(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 28(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, %ecx
+; i686-NEXT: andl $31, %ecx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 36(%esp,%edx), %esi
+; i686-NEXT: movl 32(%esp,%edx), %eax
+; i686-NEXT: shrdl %cl, %esi, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 40(%esp,%edx), %eax
+; i686-NEXT: shrdl %cl, %eax, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 44(%esp,%edx), %edx
+; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %ecx, %edi
+; i686-NEXT: shrdl %cl, %edx, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 56(%ebp), %edx
; i686-NEXT: movl %edx, %eax
; i686-NEXT: andl $31, %eax
; i686-NEXT: shrl $3, %edx
; i686-NEXT: andl $12, %edx
-; i686-NEXT: movl 36(%esp,%edx), %edi
-; i686-NEXT: movl 28(%esp,%edx), %ecx
-; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 32(%esp,%edx), %ebx
-; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 68(%esp,%edx), %ebx
+; i686-NEXT: movl 64(%esp,%edx), %esi
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %edi, %ebx
-; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 40(%esp,%edx), %edx
-; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT: shrdl %cl, %edx, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: shrdl %cl, %ebx, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 72(%esp,%edx), %esi
+; i686-NEXT: shrdl %cl, %esi, %ebx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %esi, %edx
-; i686-NEXT: andl $31, %edx
-; i686-NEXT: shrl $3, %esi
-; i686-NEXT: andl $12, %esi
-; i686-NEXT: movl 68(%esp,%esi), %ebp
-; i686-NEXT: movl 64(%esp,%esi), %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 60(%esp,%esi), %edi
-; i686-NEXT: movl 72(%esp,%esi), %esi
-; i686-NEXT: shrdl %cl, %esi, %ebp
+; i686-NEXT: movl 76(%esp,%edx), %edx
+; i686-NEXT: shrdl %cl, %edx, %esi
+; i686-NEXT: movl %edi, %ecx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: shrl %cl, %edi
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shrdl %cl, %eax, %edi
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %esi, 28(%eax)
-; i686-NEXT: movl %ebp, 24(%eax)
+; i686-NEXT: shrl %cl, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl %esi, 24(%eax)
+; i686-NEXT: movl %ebx, 20(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT: movl %ecx, 20(%eax)
-; i686-NEXT: movl %edi, 16(%eax)
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT: movl %ecx, 12(%eax)
+; i686-NEXT: movl %ecx, 16(%eax)
+; i686-NEXT: movl %edi, 12(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, 8(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, 4(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, (%eax)
-; i686-NEXT: addl $92, %esp
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -365,82 +370,80 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-LABEL: test_ashr_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $92, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: sarl $31, %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $112, %esp
+; i686-NEXT: movl 40(%ebp), %edx
+; i686-NEXT: movl 32(%ebp), %eax
+; i686-NEXT: movl 36(%ebp), %esi
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 12(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 16(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 20(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 28(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT: sarl $31, %eax
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, %ecx
+; i686-NEXT: andl $31, %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 36(%esp,%edx), %esi
+; i686-NEXT: movl 32(%esp,%edx), %eax
+; i686-NEXT: shrdl %cl, %esi, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 40(%esp,%edx), %eax
+; i686-NEXT: shrdl %cl, %eax, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 44(%esp,%edx), %edx
+; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shrdl %cl, %edx, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 56(%ebp), %edx
; i686-NEXT: movl %edx, %eax
; i686-NEXT: andl $31, %eax
; i686-NEXT: shrl $3, %edx
; i686-NEXT: andl $12, %edx
-; i686-NEXT: movl 36(%esp,%edx), %edi
-; i686-NEXT: movl 28(%esp,%edx), %ecx
-; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 32(%esp,%edx), %ebx
-; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 68(%esp,%edx), %ebx
+; i686-NEXT: movl 64(%esp,%edx), %edi
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %edi, %ebx
-; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 40(%esp,%edx), %edx
-; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT: shrdl %cl, %edx, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %esi, %edx
-; i686-NEXT: andl $31, %edx
-; i686-NEXT: shrl $3, %esi
-; i686-NEXT: andl $12, %esi
-; i686-NEXT: movl 68(%esp,%esi), %ebp
-; i686-NEXT: movl 64(%esp,%esi), %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 60(%esp,%esi), %edi
-; i686-NEXT: movl 72(%esp,%esi), %esi
-; i686-NEXT: shrdl %cl, %esi, %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: sarl %cl, (%esp) # 4-byte Folded Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shrdl %cl, %eax, %edi
-; i686-NEXT: sarl %cl, %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %esi, 28(%eax)
-; i686-NEXT: movl %ebp, 24(%eax)
+; i686-NEXT: shrdl %cl, %ebx, %edi
+; i686-NEXT: movl 72(%esp,%edx), %esi
+; i686-NEXT: shrdl %cl, %esi, %ebx
+; i686-NEXT: movl 76(%esp,%edx), %edx
+; i686-NEXT: shrdl %cl, %edx, %esi
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: sarl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: sarl %cl, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl %esi, 24(%eax)
+; i686-NEXT: movl %ebx, 20(%eax)
; i686-NEXT: movl %edi, 16(%eax)
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, 12(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, 8(%eax)
@@ -448,7 +451,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %ecx, 4(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, (%eax)
-; i686-NEXT: addl $92, %esp
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -492,94 +495,98 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-LABEL: test_shl_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $128, %esp
+; i686-NEXT: movl 40(%ebp), %edi
+; i686-NEXT: movl 32(%ebp), %ecx
+; i686-NEXT: movl 36(%ebp), %eax
+; i686-NEXT: movl 24(%ebp), %edx
+; i686-NEXT: movl 8(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 12(%ebp), %esi
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 28(%ebp), %esi
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ecx, %ebx
-; i686-NEXT: shrl $3, %ebx
-; i686-NEXT: andl $12, %ebx
-; i686-NEXT: leal {{[0-9]+}}(%esp), %edx
-; i686-NEXT: subl %ebx, %edx
-; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 20(%ebp), %edx
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, %ecx
+; i686-NEXT: movl %edi, %edx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: leal {{[0-9]+}}(%esp), %eax
+; i686-NEXT: subl %edx, %eax
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl (%edx), %esi
+; i686-NEXT: movl (%eax), %esi
; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 4(%edx), %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 8(%edx), %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 4(%eax), %edx
; i686-NEXT: andl $31, %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: movl %edx, %edi
; i686-NEXT: shldl %cl, %esi, %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %eax, %ebp
-; i686-NEXT: shrl $3, %ebp
-; i686-NEXT: andl $12, %ebp
-; i686-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: subl %ebp, %ecx
-; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%eax), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shldl %cl, %edx, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 56(%ebp), %eax
+; i686-NEXT: movl %eax, %edx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: leal {{[0-9]+}}(%esp), %esi
+; i686-NEXT: subl %edx, %esi
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl (%esi), %edi
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl (%ecx), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 4(%ecx), %edi
-; i686-NEXT: movl 8(%ecx), %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: andl $31, %eax
-; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
+; i686-NEXT: movl 4(%esi), %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shldl %cl, %edi, %esi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, %eax
+; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%esi), %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT: shldl %cl, %ebx, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT: shll %cl, %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: negl %ebx
-; i686-NEXT: movl 64(%esp,%ebx), %ebx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shldl %cl, %eax, %ebx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, %edx
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shldl %cl, %eax, %edi
-; i686-NEXT: negl %ebp
-; i686-NEXT: movl 96(%esp,%ebp), %ebp
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: negl %ecx
+; i686-NEXT: movl 76(%esp,%ecx), %ebx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT: shldl %cl, %esi, %ebx
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shll %cl, %edi
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT: negl %edx
+; i686-NEXT: movl 108(%esp,%edx), %edx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shldl %cl, %eax, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 28(%eax)
-; i686-NEXT: movl %esi, 24(%eax)
-; i686-NEXT: movl %edi, 20(%eax)
-; i686-NEXT: movl %edx, 16(%eax)
+; i686-NEXT: shldl %cl, %eax, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %edi, 16(%eax)
; i686-NEXT: movl %ebx, 12(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, 8(%eax)
@@ -587,7 +594,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: movl %ecx, 4(%eax)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: movl %ecx, (%eax)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
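(Similarly, a hedged IR sketch for the shift-i256.ll checks that follow; the signature comes from the hunk header, and the ashr body is inferred from the sarl in the checks rather than copied from the test.)

define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
entry:
  %shr = ashr i256 %x, %a
  store i256 %shr, ptr %r
  ret void
}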
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index bf159acc43f91..f015dfddcfcaf 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -8,74 +8,74 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-LABEL: shift1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $80, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $96, %esp
+; CHECK-NEXT: movl 40(%ebp), %ecx
+; CHECK-NEXT: movl 16(%ebp), %edx
+; CHECK-NEXT: movl 20(%ebp), %eax
+; CHECK-NEXT: movl 32(%ebp), %esi
+; CHECK-NEXT: movl 28(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 12(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 24(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 8(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 36(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: sarl $31, %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT: sarl $31, %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrb $5, %al
-; CHECK-NEXT: movzbl %al, %ebp
-; CHECK-NEXT: movl 24(%esp,%ebp,4), %eax
-; CHECK-NEXT: movl 20(%esp,%ebp,4), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrdl %cl, %eax, %edx
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: movl 20(%esp,%eax,4), %edx
+; CHECK-NEXT: movl 16(%esp,%eax,4), %esi
+; CHECK-NEXT: shrdl %cl, %edx, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 24(%esp,%eax,4), %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: shrdl %cl, %esi, %edx
; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 28(%esp,%ebp,4), %edx
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 32(%esp,%ebp,4), %ebx
-; CHECK-NEXT: shrdl %cl, %ebx, %edx
-; CHECK-NEXT: movl %edx, (%esp) # 4-byte Spill
-; CHECK-NEXT: movl 36(%esp,%ebp,4), %edx
+; CHECK-NEXT: movl 36(%esp,%eax,4), %ebx
+; CHECK-NEXT: movl 32(%esp,%eax,4), %edi
+; CHECK-NEXT: movl 40(%esp,%eax,4), %edx
+; CHECK-NEXT: movl 28(%esp,%eax,4), %esi
+; CHECK-NEXT: movl 44(%esp,%eax,4), %eax
+; CHECK-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: shrdl %cl, %edi, %esi
+; CHECK-NEXT: shrdl %cl, %ebx, %edi
; CHECK-NEXT: shrdl %cl, %edx, %ebx
-; CHECK-NEXT: movl 40(%esp,%ebp,4), %eax
; CHECK-NEXT: shrdl %cl, %eax, %edx
-; CHECK-NEXT: movl 16(%esp,%ebp,4), %esi
-; CHECK-NEXT: movl 44(%esp,%ebp,4), %ebp
-; CHECK-NEXT: shrdl %cl, %ebp, %eax
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edi, %esi
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: sarl %cl, %ebp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl %ebp, 28(%ecx)
-; CHECK-NEXT: movl %eax, 24(%ecx)
-; CHECK-NEXT: movl %edx, 20(%ecx)
-; CHECK-NEXT: movl %ebx, 16(%ecx)
-; CHECK-NEXT: movl (%esp), %eax # 4-byte Reload
-; CHECK-NEXT: movl %eax, 12(%ecx)
+; CHECK-NEXT: sarl %cl, %eax
+; CHECK-NEXT: movl 72(%ebp), %ecx
+; CHECK-NEXT: movl %eax, 28(%ecx)
+; CHECK-NEXT: movl %edx, 24(%ecx)
+; CHECK-NEXT: movl %ebx, 20(%ecx)
+; CHECK-NEXT: movl %edi, 16(%ecx)
+; CHECK-NEXT: movl %esi, 12(%ecx)
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT: movl %eax, 8(%ecx)
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT: movl %eax, 4(%ecx)
-; CHECK-NEXT: movl %esi, (%ecx)
-; CHECK-NEXT: addl $80, %esp
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, (%ecx)
+; CHECK-NEXT: leal -12(%ebp), %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -102,13 +102,13 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O0-NEXT: shrb $6, %dl
; CHECK-X64-O0-NEXT: movzbl %dl, %edx
; CHECK-X64-O0-NEXT: movl %edx, %edi
-; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi,8), %rsi
-; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi,8), %r8
-; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT: movq -72(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi,8), %r9
; CHECK-X64-O0-NEXT: movq %r9, %rdx
; CHECK-X64-O0-NEXT: shrdq %cl, %rsi, %rdx
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi,8), %rdi
+; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi,8), %rdi
; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
; CHECK-X64-O0-NEXT: shrdq %cl, %r9, %r8
@@ -126,30 +126,29 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O2: # %bb.0: # %entry
; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: sarq $63, %rcx
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movl %r8d, %ecx
; CHECK-X64-O2-NEXT: shrb $6, %cl
-; CHECK-X64-O2-NEXT: movzbl %cl, %edx
-; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx,8), %rsi
-; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx,8), %rdi
-; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx,8), %r9
-; CHECK-X64-O2-NEXT: movq %r9, %r10
+; CHECK-X64-O2-NEXT: movzbl %cl, %ecx
+; CHECK-X64-O2-NEXT: movq -64(%rsp,%rcx,8), %rdx
+; CHECK-X64-O2-NEXT: movq -48(%rsp,%rcx,8), %rsi
+; CHECK-X64-O2-NEXT: movq -72(%rsp,%rcx,8), %rdi
+; CHECK-X64-O2-NEXT: movq -56(%rsp,%rcx,8), %r9
; CHECK-X64-O2-NEXT: movl %r8d, %ecx
-; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r10
-; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx,8), %rdx
-; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rsi
-; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdi
-; CHECK-X64-O2-NEXT: sarq %cl, %rdx
-; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
-; CHECK-X64-O2-NEXT: movq %rsi, 16(%rax)
-; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rdi
+; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdx
+; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r9
+; CHECK-X64-O2-NEXT: sarq %cl, %rsi
+; CHECK-X64-O2-NEXT: movq %rsi, 24(%rax)
+; CHECK-X64-O2-NEXT: movq %r9, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %rdx, 8(%rax)
; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
; CHECK-X64-O2-NEXT: retq
entry:
@@ -162,17 +161,13 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-LABEL: shift2:
; CHECK: # %bb.0:
; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $80, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $112, %esp
+; CHECK-NEXT: movl 12(%ebp), %ecx
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -181,54 +176,69 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrb $3, %al
; CHECK-NEXT: andb $28, %al
; CHECK-NEXT: negb %al
; CHECK-NEXT: movsbl %al, %eax
-; CHECK-NEXT: movl 52(%esp,%eax), %esi
+; CHECK-NEXT: movl 64(%esp,%eax), %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 56(%esp,%eax), %edx
+; CHECK-NEXT: movl 68(%esp,%eax), %edx
; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: shldl %cl, %esi, %edi
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 60(%esp,%eax), %esi
-; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shldl %cl, %edx, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 64(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, %ebp
-; CHECK-NEXT: shldl %cl, %esi, %ebp
-; CHECK-NEXT: movl 68(%esp,%eax), %esi
-; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl 80(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 84(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 72(%esp,%eax), %ebx
+; CHECK-NEXT: movl %ebx, %esi
+; CHECK-NEXT: shldl %cl, %edx, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 88(%esp,%eax), %esi
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 76(%esp,%eax), %edi
+; CHECK-NEXT: movl 92(%esp,%eax), %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shldl %cl, %ebx, %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shldl %cl, %edi, %eax
+; CHECK-NEXT: movl %eax, (%esp) # 4-byte Spill
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, %ebx
; CHECK-NEXT: shldl %cl, %edx, %ebx
-; CHECK-NEXT: movl 72(%esp,%eax), %edi
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shldl %cl, %esi, %edx
-; CHECK-NEXT: movl 48(%esp,%eax), %esi
-; CHECK-NEXT: movl %esi, (%esp) # 4-byte Spill
-; CHECK-NEXT: movl 76(%esp,%eax), %esi
-; CHECK-NEXT: shldl %cl, %edi, %esi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl %esi, 28(%eax)
-; CHECK-NEXT: movl %edx, 24(%eax)
-; CHECK-NEXT: movl %ebx, 20(%eax)
-; CHECK-NEXT: movl %ebp, 16(%eax)
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: shldl %cl, %eax, %edi
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: shldl %cl, %esi, %eax
+; CHECK-NEXT: movl 8(%ebp), %esi
+; CHECK-NEXT: movl %eax, 28(%esi)
+; CHECK-NEXT: movl %edi, 24(%esi)
+; CHECK-NEXT: movl %ebx, 20(%esi)
+; CHECK-NEXT: movl (%esp), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 16(%esi)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 12(%esi)
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: movl %edx, 12(%eax)
+; CHECK-NEXT: movl %edx, 8(%esi)
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: movl %edx, 8(%eax)
-; CHECK-NEXT: movl (%esp), %edi # 4-byte Reload
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shll %cl, %edx
+; CHECK-NEXT: movl %edx, 4(%esi)
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %edi, %esi
-; CHECK-NEXT: movl %esi, 4(%eax)
-; CHECK-NEXT: movl %edx, (%eax)
-; CHECK-NEXT: addl $80, %esp
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: shll %cl, %edx
+; CHECK-NEXT: movl %edx, (%esi)
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: leal -12(%ebp), %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -253,13 +263,13 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O0-NEXT: andb $24, %dl
; CHECK-X64-O0-NEXT: negb %dl
; CHECK-X64-O0-NEXT: movsbq %dl, %r8
-; CHECK-X64-O0-NEXT: movq -32(%rsp,%r8), %r9
-; CHECK-X64-O0-NEXT: movq -24(%rsp,%r8), %rdx
-; CHECK-X64-O0-NEXT: movq -16(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT: movq -40(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT: movq -32(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT: movq -24(%rsp,%r8), %r10
; CHECK-X64-O0-NEXT: movq %r10, %rsi
; CHECK-X64-O0-NEXT: shldq %cl, %rdx, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: movq -8(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT: movq -16(%rsp,%r8), %r8
; CHECK-X64-O0-NEXT: shldq %cl, %r10, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
; CHECK-X64-O0-NEXT: movq %r9, %r10
@@ -278,34 +288,32 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O2: # %bb.0:
; CHECK-X64-O2-NEXT: movq %rsi, %rcx
; CHECK-X64-O2-NEXT: movq %rdi, %rax
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: xorps %xmm0, %xmm0
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movl %ecx, %edx
; CHECK-X64-O2-NEXT: shrb $3, %dl
; CHECK-X64-O2-NEXT: andb $24, %dl
; CHECK-X64-O2-NEXT: negb %dl
; CHECK-X64-O2-NEXT: movsbq %dl, %rdx
-; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rsi
; CHECK-X64-O2-NEXT: movq -24(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %rdx
; CHECK-X64-O2-NEXT: movq %r8, %r9
-; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %r9
-; CHECK-X64-O2-NEXT: movq -8(%rsp,%rdx), %rdx
-; CHECK-X64-O2-NEXT: shldq %cl, %r8, %rdx
-; CHECK-X64-O2-NEXT: movq %rsi, %r8
-; CHECK-X64-O2-NEXT: shlq %cl, %r8
+; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %r9
+; CHECK-X64-O2-NEXT: movq %rdi, %r10
+; CHECK-X64-O2-NEXT: shldq %cl, %r8, %r10
+; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %rdx
; CHECK-X64-O2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %rdi
+; CHECK-X64-O2-NEXT: shlq %cl, %rsi
; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
-; CHECK-X64-O2-NEXT: movq %r9, 16(%rax)
-; CHECK-X64-O2-NEXT: movq %rdi, 8(%rax)
-; CHECK-X64-O2-NEXT: movq %r8, (%rax)
+; CHECK-X64-O2-NEXT: movq %r10, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %r9, 8(%rax)
+; CHECK-X64-O2-NEXT: movq %rsi, (%rax)
; CHECK-X64-O2-NEXT: retq
{
%b = shl i256 1, %c ; %c must not be a constant
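(For the wide-scalar shift tests that follow, a sketch of the lshr_16bytes pattern; the signature matches the hunk header, and the body, which loads a byte offset and scales it to bits, is an assumption consistent with the shlb $3 in the checks.)

define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
  %src = load i128, ptr %src.ptr, align 1
  %byteOff = load i128, ptr %byteOff.ptr, align 1
  %bitOff = shl i128 %byteOff, 3
  %res = lshr i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

The byte offset is scaled to a bit count (the shlb $3 in the checks) before the shift is expanded through the stack slot, which is why the FALLBACK prefixes below all mask the offset with andb $12 when indexing the spilled value.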
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index e5affd86312ef..7d12a8166d861 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -646,76 +646,784 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: lshr_16bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl (%edx), %esi
-; X86-SSE2-NEXT: movl 4(%edx), %edi
-; X86-SSE2-NEXT: movl 8(%edx), %ebx
-; X86-SSE2-NEXT: movl 12(%edx), %edx
-; X86-SSE2-NEXT: movzbl (%ecx), %ecx
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, (%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $15, %ecx
-; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
-; X86-SSE2-NEXT: movl %ecx, 8(%eax)
-; X86-SSE2-NEXT: movl %edi, 12(%eax)
-; X86-SSE2-NEXT: movl %edx, (%eax)
-; X86-SSE2-NEXT: movl %esi, 4(%eax)
-; X86-SSE2-NEXT: addl $32, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: lshr_16bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $32, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movzbl (%ecx), %ecx
-; X86-SSE42-NEXT: xorps %xmm1, %xmm1
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $15, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $32, %esp
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX-LABEL: lshr_16bytes:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $32, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %xmm0
-; X86-AVX-NEXT: movzbl (%ecx), %ecx
-; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm0, (%esp)
-; X86-AVX-NEXT: andl $15, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $32, %esp
-; X86-AVX-NEXT: retl
+; FALLBACK16-LABEL: lshr_16bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $60, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl 12(%ecx), %edx
+; FALLBACK16-NEXT: movl 8(%ecx), %esi
+; FALLBACK16-NEXT: movl (%ecx), %edi
+; FALLBACK16-NEXT: movl 4(%ecx), %ecx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movb %ah, %al
+; FALLBACK16-NEXT: shlb $3, %al
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $12, %ah
+; FALLBACK16-NEXT: movzbl %ah, %ebp
+; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl %ebx, 12(%edx)
+; FALLBACK16-NEXT: movl %ebp, 8(%edx)
+; FALLBACK16-NEXT: movl %esi, (%edx)
+; FALLBACK16-NEXT: movl %edi, 4(%edx)
+; FALLBACK16-NEXT: addl $60, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_16bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $44, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT: movl 12(%edx), %esi
+; FALLBACK17-NEXT: movl (%edx), %edi
+; FALLBACK17-NEXT: movl 4(%edx), %ebx
+; FALLBACK17-NEXT: movl 8(%edx), %edx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, (%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $12, %ch
+; FALLBACK17-NEXT: movzbl %ch, %edi
+; FALLBACK17-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK17-NEXT: movl 4(%esp,%edi), %ebx
+; FALLBACK17-NEXT: movl %ebx, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl (%esp,%edi), %ebp
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %ebp
+; FALLBACK17-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT: shrl %cl, %edi
+; FALLBACK17-NEXT: movl %esi, 8(%eax)
+; FALLBACK17-NEXT: movl %edi, 12(%eax)
+; FALLBACK17-NEXT: movl %ebp, (%eax)
+; FALLBACK17-NEXT: movl %edx, 4(%eax)
+; FALLBACK17-NEXT: addl $44, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_16bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $44, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl 12(%ecx), %edx
+; FALLBACK18-NEXT: movl 8(%ecx), %esi
+; FALLBACK18-NEXT: movl (%ecx), %edi
+; FALLBACK18-NEXT: movl 4(%ecx), %ecx
+; FALLBACK18-NEXT: movzbl (%eax), %ebx
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, (%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $12, %bl
+; FALLBACK18-NEXT: movzbl %bl, %esi
+; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi
+; FALLBACK18-NEXT: shrxl %eax, %edi, %ebx
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %ebx, %ecx
+; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebx
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT: orl %ebx, %edi
+; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx
+; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi
+; FALLBACK18-NEXT: shrxl %eax, %esi, %eax
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %edx
+; FALLBACK18-NEXT: orl %ebx, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT: movl %eax, 12(%esi)
+; FALLBACK18-NEXT: movl %edx, 8(%esi)
+; FALLBACK18-NEXT: movl %edi, (%esi)
+; FALLBACK18-NEXT: movl %ecx, 4(%esi)
+; FALLBACK18-NEXT: addl $44, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_16bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $44, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT: movl 12(%edx), %esi
+; FALLBACK19-NEXT: movl (%edx), %edi
+; FALLBACK19-NEXT: movl 4(%edx), %ebx
+; FALLBACK19-NEXT: movl 8(%edx), %edx
+; FALLBACK19-NEXT: movzbl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, (%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $12, %al
+; FALLBACK19-NEXT: movzbl %al, %eax
+; FALLBACK19-NEXT: movl 8(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl 4(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl %edi, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl (%esp,%eax), %ebx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax
+; FALLBACK19-NEXT: shrxl %ecx, %eax, %edi
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, 8(%ebp)
+; FALLBACK19-NEXT: movl %edi, 12(%ebp)
+; FALLBACK19-NEXT: movl %ebx, (%ebp)
+; FALLBACK19-NEXT: movl %edx, 4(%ebp)
+; FALLBACK19-NEXT: addl $44, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_16bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $60, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm1, %xmm1
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $12, %cl
+; FALLBACK20-NEXT: movzbl %cl, %edi
+; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %esi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl %edi, 12(%edx)
+; FALLBACK20-NEXT: movl %ebx, 4(%edx)
+; FALLBACK20-NEXT: movl %ebp, 8(%edx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, (%edx)
+; FALLBACK20-NEXT: addl $60, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_16bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $44, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movups (%edx), %xmm0
+; FALLBACK21-NEXT: movzbl (%ecx), %edx
+; FALLBACK21-NEXT: movl %edx, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm1, %xmm1
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, (%esp)
+; FALLBACK21-NEXT: andb $12, %dl
+; FALLBACK21-NEXT: movzbl %dl, %ebx
+; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK21-NEXT: movl %ebp, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK21-NEXT: movl %eax, %ebx
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK21-NEXT: movl %edi, 8(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: shrl %cl, %edx
+; FALLBACK21-NEXT: movl %edx, 12(%ebp)
+; FALLBACK21-NEXT: movl %esi, (%ebp)
+; FALLBACK21-NEXT: addl $44, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_16bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm1, %xmm1
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, (%esp)
+; FALLBACK22-NEXT: andb $12, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: notb %cl
+; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK22-NEXT: orl %ebx, %edx
+; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK22-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT: orl %ebx, %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl %eax, 12(%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %edi, 8(%esi)
+; FALLBACK22-NEXT: movl %edx, (%esi)
+; FALLBACK22-NEXT: addl $44, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_16bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $44, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movups (%edx), %xmm0
+; FALLBACK23-NEXT: movzbl (%ecx), %edx
+; FALLBACK23-NEXT: movl %edx, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm1, %xmm1
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, (%esp)
+; FALLBACK23-NEXT: andb $12, %dl
+; FALLBACK23-NEXT: movzbl %dl, %ebx
+; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %ebx
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK23-NEXT: movl %edi, 8(%ebp)
+; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK23-NEXT: movl %edx, 12(%ebp)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, (%ebp)
+; FALLBACK23-NEXT: addl $44, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_16bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $60, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $12, %cl
+; FALLBACK24-NEXT: movzbl %cl, %edi
+; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %esi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl %edi, 12(%edx)
+; FALLBACK24-NEXT: movl %ebx, 4(%edx)
+; FALLBACK24-NEXT: movl %ebp, 8(%edx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, (%edx)
+; FALLBACK24-NEXT: addl $60, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_16bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $44, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: vmovups (%edx), %xmm0
+; FALLBACK25-NEXT: movzbl (%ecx), %edx
+; FALLBACK25-NEXT: movl %edx, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK25-NEXT: andb $12, %dl
+; FALLBACK25-NEXT: movzbl %dl, %ebx
+; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK25-NEXT: movl %ebp, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK25-NEXT: movl %eax, %ebx
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK25-NEXT: movl %edi, 8(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: shrl %cl, %edx
+; FALLBACK25-NEXT: movl %edx, 12(%ebp)
+; FALLBACK25-NEXT: movl %esi, (%ebp)
+; FALLBACK25-NEXT: addl $44, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_16bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $44, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK26-NEXT: andb $12, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: notb %cl
+; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK26-NEXT: orl %ebx, %edx
+; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK26-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT: orl %ebx, %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT: orl %ebp, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl %eax, 12(%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %edi, 8(%esi)
+; FALLBACK26-NEXT: movl %edx, (%esi)
+; FALLBACK26-NEXT: addl $44, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_16bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $44, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: vmovups (%edx), %xmm0
+; FALLBACK27-NEXT: movzbl (%ecx), %edx
+; FALLBACK27-NEXT: movl %edx, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK27-NEXT: andb $12, %dl
+; FALLBACK27-NEXT: movzbl %dl, %ebx
+; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %ebx
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK27-NEXT: movl %edi, 8(%ebp)
+; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK27-NEXT: movl %edx, 12(%ebp)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, (%ebp)
+; FALLBACK27-NEXT: addl $44, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_16bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $60, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $12, %cl
+; FALLBACK28-NEXT: movzbl %cl, %edi
+; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %esi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl %edi, 12(%edx)
+; FALLBACK28-NEXT: movl %ebx, 4(%edx)
+; FALLBACK28-NEXT: movl %ebp, 8(%edx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, (%edx)
+; FALLBACK28-NEXT: addl $60, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_16bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $44, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: vmovups (%edx), %xmm0
+; FALLBACK29-NEXT: movzbl (%ecx), %edx
+; FALLBACK29-NEXT: movl %edx, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK29-NEXT: andb $12, %dl
+; FALLBACK29-NEXT: movzbl %dl, %ebx
+; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK29-NEXT: movl %ebp, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK29-NEXT: movl %eax, %ebx
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK29-NEXT: movl %edi, 8(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: shrl %cl, %edx
+; FALLBACK29-NEXT: movl %edx, 12(%ebp)
+; FALLBACK29-NEXT: movl %esi, (%ebp)
+; FALLBACK29-NEXT: addl $44, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_16bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $44, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK30-NEXT: andb $12, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: notb %cl
+; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK30-NEXT: orl %ebx, %edx
+; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK30-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT: orl %ebx, %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT: orl %ebp, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl %eax, 12(%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %edi, 8(%esi)
+; FALLBACK30-NEXT: movl %edx, (%esi)
+; FALLBACK30-NEXT: addl $44, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_16bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $44, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: vmovups (%edx), %xmm0
+; FALLBACK31-NEXT: movzbl (%ecx), %edx
+; FALLBACK31-NEXT: movl %edx, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK31-NEXT: andb $12, %dl
+; FALLBACK31-NEXT: movzbl %dl, %ebx
+; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %ebx
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK31-NEXT: movl %edi, 8(%ebp)
+; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK31-NEXT: movl %edx, 12(%ebp)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, (%ebp)
+; FALLBACK31-NEXT: addl $44, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
%bitOff = shl i128 %byteOff, 3
@@ -800,82 +1508,796 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: shl_16bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl (%edx), %esi
-; X86-SSE2-NEXT: movl 4(%edx), %edi
-; X86-SSE2-NEXT: movl 8(%edx), %ebx
-; X86-SSE2-NEXT: movl 12(%edx), %edx
-; X86-SSE2-NEXT: movzbl (%ecx), %ecx
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, (%esp)
-; X86-SSE2-NEXT: andb $15, %cl
-; X86-SSE2-NEXT: negb %cl
-; X86-SSE2-NEXT: movsbl %cl, %ecx
-; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx
-; X86-SSE2-NEXT: movl 20(%esp,%ecx), %esi
-; X86-SSE2-NEXT: movl 28(%esp,%ecx), %edi
-; X86-SSE2-NEXT: movl 24(%esp,%ecx), %ecx
-; X86-SSE2-NEXT: movl %ecx, 8(%eax)
-; X86-SSE2-NEXT: movl %edi, 12(%eax)
-; X86-SSE2-NEXT: movl %edx, (%eax)
-; X86-SSE2-NEXT: movl %esi, 4(%eax)
-; X86-SSE2-NEXT: addl $32, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: shl_16bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $32, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movzbl (%ecx), %ecx
-; X86-SSE42-NEXT: xorps %xmm1, %xmm1
-; X86-SSE42-NEXT: movups %xmm1, (%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andb $15, %cl
-; X86-SSE42-NEXT: negb %cl
-; X86-SSE42-NEXT: movsbl %cl, %ecx
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $32, %esp
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX-LABEL: shl_16bytes:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $32, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %xmm0
-; X86-AVX-NEXT: movzbl (%ecx), %ecx
-; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, (%esp)
-; X86-AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andb $15, %cl
-; X86-AVX-NEXT: negb %cl
-; X86-AVX-NEXT: movsbl %cl, %ecx
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $32, %esp
-; X86-AVX-NEXT: retl
+; FALLBACK16-LABEL: shl_16bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $60, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl 8(%ecx), %ebx
+; FALLBACK16-NEXT: movl 12(%ecx), %esi
+; FALLBACK16-NEXT: movl (%ecx), %edi
+; FALLBACK16-NEXT: movl 4(%ecx), %ecx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movb %ah, %dh
+; FALLBACK16-NEXT: shlb $3, %dh
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $12, %ah
+; FALLBACK16-NEXT: negb %ah
+; FALLBACK16-NEXT: movsbl %ah, %ebp
+; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movb %dh, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 32(%esp,%ebp), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 40(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 8(%eax)
+; FALLBACK16-NEXT: movl %ebp, 12(%eax)
+; FALLBACK16-NEXT: movl %edi, 4(%eax)
+; FALLBACK16-NEXT: addl $60, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_16bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $44, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT: movl 12(%edx), %esi
+; FALLBACK17-NEXT: movl 8(%edx), %edi
+; FALLBACK17-NEXT: movl (%edx), %ebx
+; FALLBACK17-NEXT: movl 4(%edx), %edx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, (%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $12, %ch
+; FALLBACK17-NEXT: negb %ch
+; FALLBACK17-NEXT: movsbl %ch, %edi
+; FALLBACK17-NEXT: movl 16(%esp,%edi), %edx
+; FALLBACK17-NEXT: movl 20(%esp,%edi), %ebx
+; FALLBACK17-NEXT: movl %ebx, %esi
+; FALLBACK17-NEXT: shldl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl 24(%esp,%edi), %ebp
+; FALLBACK17-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK17-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK17-NEXT: shldl %cl, %ebx, %ebp
+; FALLBACK17-NEXT: shll %cl, %edx
+; FALLBACK17-NEXT: movl %ebp, 8(%eax)
+; FALLBACK17-NEXT: movl %edi, 12(%eax)
+; FALLBACK17-NEXT: movl %edx, (%eax)
+; FALLBACK17-NEXT: movl %esi, 4(%eax)
+; FALLBACK17-NEXT: addl $44, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_16bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $44, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl 8(%ecx), %edx
+; FALLBACK18-NEXT: movl 12(%ecx), %esi
+; FALLBACK18-NEXT: movl (%ecx), %edi
+; FALLBACK18-NEXT: movl 4(%ecx), %ecx
+; FALLBACK18-NEXT: movzbl (%eax), %eax
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: shlb $3, %bl
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, (%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $12, %al
+; FALLBACK18-NEXT: negb %al
+; FALLBACK18-NEXT: movsbl %al, %edx
+; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: notb %al
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %eax, %edi, %edi
+; FALLBACK18-NEXT: orl %esi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi
+; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx
+; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx
+; FALLBACK18-NEXT: shrl %edx
+; FALLBACK18-NEXT: shrxl %eax, %edx, %edx
+; FALLBACK18-NEXT: orl %esi, %edx
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl %ebp, (%ecx)
+; FALLBACK18-NEXT: movl %eax, 8(%ecx)
+; FALLBACK18-NEXT: movl %edx, 12(%ecx)
+; FALLBACK18-NEXT: movl %edi, 4(%ecx)
+; FALLBACK18-NEXT: addl $44, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_16bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $44, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT: movl 12(%edx), %esi
+; FALLBACK19-NEXT: movl 8(%edx), %edi
+; FALLBACK19-NEXT: movl (%edx), %ebx
+; FALLBACK19-NEXT: movl 4(%edx), %edx
+; FALLBACK19-NEXT: movzbl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, (%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $12, %al
+; FALLBACK19-NEXT: negb %al
+; FALLBACK19-NEXT: movsbl %al, %eax
+; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl 20(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %edx
+; FALLBACK19-NEXT: shldl %cl, %edi, %edx
+; FALLBACK19-NEXT: movl 24(%esp,%eax), %ebx
+; FALLBACK19-NEXT: movl 28(%esp,%eax), %eax
+; FALLBACK19-NEXT: shldl %cl, %ebx, %eax
+; FALLBACK19-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT: movl %ebx, 8(%ebp)
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: movl %edi, (%ebp)
+; FALLBACK19-NEXT: movl %edx, 4(%ebp)
+; FALLBACK19-NEXT: addl $44, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_16bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $60, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm1, %xmm1
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $12, %cl
+; FALLBACK20-NEXT: negb %cl
+; FALLBACK20-NEXT: movsbl %cl, %edi
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebp, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: movl %eax, (%edx)
+; FALLBACK20-NEXT: movl %ebp, 4(%edx)
+; FALLBACK20-NEXT: movl %edi, 8(%edx)
+; FALLBACK20-NEXT: movl %esi, 12(%edx)
+; FALLBACK20-NEXT: addl $60, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_16bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $44, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movups (%edx), %xmm0
+; FALLBACK21-NEXT: movzbl (%ecx), %edx
+; FALLBACK21-NEXT: movl %edx, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm1, %xmm1
+; FALLBACK21-NEXT: movaps %xmm1, (%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $12, %dl
+; FALLBACK21-NEXT: negb %dl
+; FALLBACK21-NEXT: movsbl %dl, %edi
+; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK21-NEXT: shldl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %ebx, %ebp
+; FALLBACK21-NEXT: shll %cl, %ebp
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT: movl %edi, 4(%eax)
+; FALLBACK21-NEXT: movl %esi, 8(%eax)
+; FALLBACK21-NEXT: movl %edx, 12(%eax)
+; FALLBACK21-NEXT: movl %ebp, (%eax)
+; FALLBACK21-NEXT: addl $44, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_16bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm1, %xmm1
+; FALLBACK22-NEXT: movaps %xmm1, (%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $12, %cl
+; FALLBACK22-NEXT: negb %cl
+; FALLBACK22-NEXT: movsbl %cl, %ecx
+; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK22-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK22-NEXT: movl %eax, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %edx
+; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT: orl %esi, %edx
+; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl %esi, %ebp
+; FALLBACK22-NEXT: shrl %ebp
+; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %edi, %ebp
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl %eax, (%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %ebp, 8(%esi)
+; FALLBACK22-NEXT: movl %edx, 12(%esi)
+; FALLBACK22-NEXT: addl $44, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_16bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $44, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movups (%edx), %xmm0
+; FALLBACK23-NEXT: movzbl (%ecx), %edx
+; FALLBACK23-NEXT: movl %edx, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm1, %xmm1
+; FALLBACK23-NEXT: movaps %xmm1, (%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $12, %dl
+; FALLBACK23-NEXT: negb %dl
+; FALLBACK23-NEXT: movsbl %dl, %edi
+; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK23-NEXT: shldl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK23-NEXT: shldl %cl, %edi, %esi
+; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK23-NEXT: movl %edi, 4(%eax)
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl %edx, 12(%eax)
+; FALLBACK23-NEXT: movl %ebp, (%eax)
+; FALLBACK23-NEXT: addl $44, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_16bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $60, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $12, %cl
+; FALLBACK24-NEXT: negb %cl
+; FALLBACK24-NEXT: movsbl %cl, %edi
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebp, %edi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: movl %eax, (%edx)
+; FALLBACK24-NEXT: movl %ebp, 4(%edx)
+; FALLBACK24-NEXT: movl %edi, 8(%edx)
+; FALLBACK24-NEXT: movl %esi, 12(%edx)
+; FALLBACK24-NEXT: addl $60, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_16bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $44, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: vmovups (%edx), %xmm0
+; FALLBACK25-NEXT: movzbl (%ecx), %edx
+; FALLBACK25-NEXT: movl %edx, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $12, %dl
+; FALLBACK25-NEXT: negb %dl
+; FALLBACK25-NEXT: movsbl %dl, %edi
+; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK25-NEXT: shldl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %ebx, %ebp
+; FALLBACK25-NEXT: shll %cl, %ebp
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT: movl %edi, 4(%eax)
+; FALLBACK25-NEXT: movl %esi, 8(%eax)
+; FALLBACK25-NEXT: movl %edx, 12(%eax)
+; FALLBACK25-NEXT: movl %ebp, (%eax)
+; FALLBACK25-NEXT: addl $44, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_16bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $44, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $12, %cl
+; FALLBACK26-NEXT: negb %cl
+; FALLBACK26-NEXT: movsbl %cl, %ecx
+; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK26-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK26-NEXT: movl %eax, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %edx
+; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT: orl %esi, %edx
+; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl %esi, %ebp
+; FALLBACK26-NEXT: shrl %ebp
+; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %edi, %ebp
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl %eax, (%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %ebp, 8(%esi)
+; FALLBACK26-NEXT: movl %edx, 12(%esi)
+; FALLBACK26-NEXT: addl $44, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_16bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $44, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: vmovups (%edx), %xmm0
+; FALLBACK27-NEXT: movzbl (%ecx), %edx
+; FALLBACK27-NEXT: movl %edx, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $12, %dl
+; FALLBACK27-NEXT: negb %dl
+; FALLBACK27-NEXT: movsbl %dl, %edi
+; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK27-NEXT: shldl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK27-NEXT: shldl %cl, %edi, %esi
+; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK27-NEXT: movl %edi, 4(%eax)
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl %edx, 12(%eax)
+; FALLBACK27-NEXT: movl %ebp, (%eax)
+; FALLBACK27-NEXT: addl $44, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_16bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $60, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $12, %cl
+; FALLBACK28-NEXT: negb %cl
+; FALLBACK28-NEXT: movsbl %cl, %edi
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebp, %edi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: movl %eax, (%edx)
+; FALLBACK28-NEXT: movl %ebp, 4(%edx)
+; FALLBACK28-NEXT: movl %edi, 8(%edx)
+; FALLBACK28-NEXT: movl %esi, 12(%edx)
+; FALLBACK28-NEXT: addl $60, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_16bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $44, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: vmovups (%edx), %xmm0
+; FALLBACK29-NEXT: movzbl (%ecx), %edx
+; FALLBACK29-NEXT: movl %edx, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $12, %dl
+; FALLBACK29-NEXT: negb %dl
+; FALLBACK29-NEXT: movsbl %dl, %edi
+; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK29-NEXT: shldl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %ebx, %ebp
+; FALLBACK29-NEXT: shll %cl, %ebp
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT: movl %edi, 4(%eax)
+; FALLBACK29-NEXT: movl %esi, 8(%eax)
+; FALLBACK29-NEXT: movl %edx, 12(%eax)
+; FALLBACK29-NEXT: movl %ebp, (%eax)
+; FALLBACK29-NEXT: addl $44, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_16bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $44, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $12, %cl
+; FALLBACK30-NEXT: negb %cl
+; FALLBACK30-NEXT: movsbl %cl, %ecx
+; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK30-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK30-NEXT: movl %eax, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %edx
+; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT: orl %esi, %edx
+; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl %esi, %ebp
+; FALLBACK30-NEXT: shrl %ebp
+; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %edi, %ebp
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl %eax, (%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %ebp, 8(%esi)
+; FALLBACK30-NEXT: movl %edx, 12(%esi)
+; FALLBACK30-NEXT: addl $44, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_16bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $44, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: vmovups (%edx), %xmm0
+; FALLBACK31-NEXT: movzbl (%ecx), %edx
+; FALLBACK31-NEXT: movl %edx, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $12, %dl
+; FALLBACK31-NEXT: negb %dl
+; FALLBACK31-NEXT: movsbl %dl, %edi
+; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK31-NEXT: shldl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK31-NEXT: shldl %cl, %edi, %esi
+; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK31-NEXT: movl %edi, 4(%eax)
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl %edx, 12(%eax)
+; FALLBACK31-NEXT: movl %ebp, (%eax)
+; FALLBACK31-NEXT: addl $44, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
%bitOff = shl i128 %byteOff, 3
@@ -960,107 +2382,226 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: ashr_16bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl (%edx), %esi
-; X86-SSE2-NEXT: movl 4(%edx), %edi
-; X86-SSE2-NEXT: movl 8(%edx), %ebx
-; X86-SSE2-NEXT: movl 12(%edx), %edx
-; X86-SSE2-NEXT: movzbl (%ecx), %ecx
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, (%esp)
-; X86-SSE2-NEXT: sarl $31, %edx
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $15, %ecx
-; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
-; X86-SSE2-NEXT: movl %ecx, 8(%eax)
-; X86-SSE2-NEXT: movl %edi, 12(%eax)
-; X86-SSE2-NEXT: movl %edx, (%eax)
-; X86-SSE2-NEXT: movl %esi, 4(%eax)
-; X86-SSE2-NEXT: addl $32, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: ashr_16bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: pushl %ebx
-; X86-SSE42-NEXT: pushl %edi
-; X86-SSE42-NEXT: pushl %esi
-; X86-SSE42-NEXT: subl $32, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movl (%edx), %esi
-; X86-SSE42-NEXT: movl 4(%edx), %edi
-; X86-SSE42-NEXT: movl 8(%edx), %ebx
-; X86-SSE42-NEXT: movl 12(%edx), %edx
-; X86-SSE42-NEXT: movzbl (%ecx), %ecx
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %esi, (%esp)
-; X86-SSE42-NEXT: sarl $31, %edx
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $15, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $32, %esp
-; X86-SSE42-NEXT: popl %esi
-; X86-SSE42-NEXT: popl %edi
-; X86-SSE42-NEXT: popl %ebx
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX-LABEL: ashr_16bytes:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: pushl %ebx
-; X86-AVX-NEXT: pushl %edi
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: subl $32, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl (%edx), %esi
-; X86-AVX-NEXT: movl 4(%edx), %edi
-; X86-AVX-NEXT: movl 8(%edx), %ebx
-; X86-AVX-NEXT: movl 12(%edx), %edx
-; X86-AVX-NEXT: movzbl (%ecx), %ecx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %esi, (%esp)
-; X86-AVX-NEXT: sarl $31, %edx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $15, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $32, %esp
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: popl %edi
-; X86-AVX-NEXT: popl %ebx
-; X86-AVX-NEXT: retl
+; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-NO-BMI2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 8(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 12(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT: movzbl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 24(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 16(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, (%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%edx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl %dl, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esp,%edi), %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl (%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%ecx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %cl
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebp,%ebp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl %dl, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esp,%edi), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
%bitOff = shl i128 %byteOff, 3
@@ -1070,172 +2611,1944 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
}
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_32bytes:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %rdi
-; X64-SSE2-NEXT: movzbl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $31, %esi
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rax, (%rdx)
-; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSE42-LABEL: lshr_32bytes:
-; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: movups (%rdi), %xmm0
-; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT: movzbl (%rsi), %eax
-; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $31, %eax
-; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: retq
-;
-; X64-AVX-LABEL: lshr_32bytes:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: movzbl (%rsi), %eax
-; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $31, %eax
-; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
-;
-; X86-SSE2-LABEL: lshr_32bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%eax), %esi
-; X86-SSE2-NEXT: movl 12(%eax), %edi
-; X86-SSE2-NEXT: movl 16(%eax), %ebx
-; X86-SSE2-NEXT: movl 20(%eax), %ebp
-; X86-SSE2-NEXT: movl 24(%eax), %edx
-; X86-SSE2-NEXT: movl 28(%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movzbl (%eax), %eax
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl %ecx, 24(%eax)
-; X86-SSE2-NEXT: movl %edx, 28(%eax)
-; X86-SSE2-NEXT: movl %ebp, 16(%eax)
-; X86-SSE2-NEXT: movl %ebx, 20(%eax)
-; X86-SSE2-NEXT: movl %edi, 8(%eax)
-; X86-SSE2-NEXT: movl %esi, 12(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: lshr_32bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $64, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movups 16(%edx), %xmm1
-; X86-SSE42-NEXT: movzbl (%ecx), %ecx
-; X86-SSE42-NEXT: xorps %xmm2, %xmm2
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $31, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $64, %esp
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX-LABEL: lshr_32bytes:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $64, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %ymm0
-; X86-AVX-NEXT: movzbl (%ecx), %ecx
-; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX-NEXT: andl $31, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $64, %esp
-; X86-AVX-NEXT: vzeroupper
-; X86-AVX-NEXT: retl
+; FALLBACK0-LABEL: lshr_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq 16(%rdi), %rcx
+; FALLBACK0-NEXT: movq (%rdi), %r8
+; FALLBACK0-NEXT: movq 8(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r8
+; FALLBACK0-NEXT: movq %rdi, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r11
+; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r10, %r9
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r11, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: leaq (%r8,%r8), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: lshr_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq 24(%rdi), %rax
+; FALLBACK1-NEXT: movq (%rdi), %r8
+; FALLBACK1-NEXT: movq 8(%rdi), %r9
+; FALLBACK1-NEXT: movq 16(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: lshr_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq 16(%rdi), %rcx
+; FALLBACK2-NEXT: movq (%rdi), %r8
+; FALLBACK2-NEXT: movq 8(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %r9
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT: shrxq %rax, %r9, %r10
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %r9, %r9
+; FALLBACK2-NEXT: shlxq %rax, %r9, %r9
+; FALLBACK2-NEXT: orq %r8, %r9
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %rcx, %rsi
+; FALLBACK2-NEXT: leaq (%rdi,%rdi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %r9, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq 24(%rdi), %rax
+; FALLBACK3-NEXT: movq (%rdi), %r8
+; FALLBACK3-NEXT: movq 8(%rdi), %r9
+; FALLBACK3-NEXT: movq 16(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rdi
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: movzbl %cl, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: movzbl %al, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shrq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: leal (,%rcx,8), %eax
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: movzbl %cl, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: movzbl %al, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: lshr_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: movzbl %cl, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: movzbl %al, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shrq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: leal (,%rcx,8), %eax
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: movzbl %cl, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: movzbl %al, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: movzbl %cl, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: movzbl %al, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shrq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: leal (,%rcx,8), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: movzbl %cl, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: movzbl %al, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: lshr_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl 12(%eax), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%eax), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %esi
+; FALLBACK16-NEXT: movl 8(%eax), %edi
+; FALLBACK16-NEXT: movl 24(%eax), %ebx
+; FALLBACK16-NEXT: movb (%ecx), %ch
+; FALLBACK16-NEXT: movl (%eax), %ebp
+; FALLBACK16-NEXT: movl 4(%eax), %edx
+; FALLBACK16-NEXT: movl 20(%eax), %eax
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movb %ch, %dl
+; FALLBACK16-NEXT: shlb $3, %dl
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %ch
+; FALLBACK16-NEXT: movzbl %ch, %ebx
+; FALLBACK16-NEXT: movl 36(%esp,%ebx), %eax
+; FALLBACK16-NEXT: movl %eax, %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movb %dl, %ch
+; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movl 40(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %eax, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 44(%esp,%ebx), %ebp
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movl 48(%esp,%ebx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%eax,%eax), %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %esi, %ebp
+; FALLBACK16-NEXT: movl %ebx, %eax
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 52(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl 56(%esp,%eax), %edx
+; FALLBACK16-NEXT: leal (%edx,%edx), %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%esp,%ebx), %ebx
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %edx, %eax
+; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl %ebx, 28(%ecx)
+; FALLBACK16-NEXT: movl %eax, 24(%ecx)
+; FALLBACK16-NEXT: movl %esi, 16(%ecx)
+; FALLBACK16-NEXT: movl %edi, 20(%ecx)
+; FALLBACK16-NEXT: movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 12(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, (%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 4(%ecx)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl 12(%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ebp), %esi
+; FALLBACK17-NEXT: movl (%ebp), %edi
+; FALLBACK17-NEXT: movl 4(%ebp), %ebx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movl 20(%ebp), %edx
+; FALLBACK17-NEXT: movl 8(%ebp), %eax
+; FALLBACK17-NEXT: movl 24(%ebp), %ebp
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %ch
+; FALLBACK17-NEXT: movzbl %ch, %eax
+; FALLBACK17-NEXT: movl 24(%esp,%eax), %esi
+; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%esp,%eax), %edi
+; FALLBACK17-NEXT: movl 20(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl 36(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %edx, %ebx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%esp,%eax), %esi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl 28(%esp,%eax), %ebp
+; FALLBACK17-NEXT: movl 44(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl %ebp, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, %ebp
+; FALLBACK17-NEXT: shrdl %cl, %edi, %ebp
+; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl %edi, 24(%eax)
+; FALLBACK17-NEXT: shrl %cl, %edx
+; FALLBACK17-NEXT: movl %edx, 28(%eax)
+; FALLBACK17-NEXT: movl %ebx, 16(%eax)
+; FALLBACK17-NEXT: movl %ebp, 20(%eax)
+; FALLBACK17-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK17-NEXT: movl %ecx, 8(%eax)
+; FALLBACK17-NEXT: movl %esi, 12(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT: movl %ecx, (%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT: movl %ecx, 4(%eax)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %esi
+; FALLBACK18-NEXT: movl 8(%eax), %edi
+; FALLBACK18-NEXT: movl 24(%eax), %edx
+; FALLBACK18-NEXT: movzbl (%ebx), %ebx
+; FALLBACK18-NEXT: movl (%eax), %ebp
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl 20(%eax), %eax
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %bl
+; FALLBACK18-NEXT: movzbl %bl, %esi
+; FALLBACK18-NEXT: movl 36(%esp,%esi), %edi
+; FALLBACK18-NEXT: shrxl %eax, %edi, %ebx
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: movl %eax, %ecx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: movl 40(%esp,%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %ecx, 32(%esp,%esi), %ebx
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%esp,%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %edi
+; FALLBACK18-NEXT: movl 44(%esp,%esi), %ebp
+; FALLBACK18-NEXT: movl %ecx, %eax
+; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %ebx, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl %ecx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 56(%esp,%esi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %ebx
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT: orl %eax, %esi
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %ebx, 28(%eax)
+; FALLBACK18-NEXT: movl %esi, 24(%eax)
+; FALLBACK18-NEXT: movl %edi, 16(%eax)
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl 12(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%ecx), %esi
+; FALLBACK19-NEXT: movl (%ecx), %edi
+; FALLBACK19-NEXT: movl 4(%ecx), %ebp
+; FALLBACK19-NEXT: movzbl (%ebx), %ebx
+; FALLBACK19-NEXT: movl 20(%ecx), %edx
+; FALLBACK19-NEXT: movl 8(%ecx), %eax
+; FALLBACK19-NEXT: movl 24(%ecx), %ecx
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %bl
+; FALLBACK19-NEXT: movzbl %bl, %eax
+; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl 40(%esp,%eax), %ebp
+; FALLBACK19-NEXT: movl 20(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 36(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl %edx, %edi
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%esp,%eax), %ebx
+; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 44(%esp,%eax), %eax
+; FALLBACK19-NEXT: movl %edx, %edi
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, %esi
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT: shrxl %ecx, %eax, %edx
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl %ebp, 24(%ecx)
+; FALLBACK19-NEXT: movl %edx, 28(%ecx)
+; FALLBACK19-NEXT: movl %ebx, 16(%ecx)
+; FALLBACK19-NEXT: movl %esi, 20(%ecx)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ecx)
+; FALLBACK19-NEXT: movl %edi, 12(%ecx)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, (%ecx)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 4(%ecx)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm2, %xmm2
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %cl
+; FALLBACK20-NEXT: movzbl %cl, %ecx
+; FALLBACK20-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK20-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK20-NEXT: movl %ecx, %edi
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 28(%eax)
+; FALLBACK20-NEXT: movl %esi, 4(%eax)
+; FALLBACK20-NEXT: movl %edi, 24(%eax)
+; FALLBACK20-NEXT: movl %ebp, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $108, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm2, %xmm2
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: movzbl %al, %ebp
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl %edi, %esi
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %esi, 4(%ebp)
+; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: shrl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %edx, (%ebp)
+; FALLBACK21-NEXT: addl $108, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %edx
+; FALLBACK22-NEXT: shlb $3, %dl
+; FALLBACK22-NEXT: xorps %xmm2, %xmm2
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %edx, %eax
+; FALLBACK22-NEXT: notb %al
+; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT: movl %eax, %ebp
+; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK22-NEXT: movl %ebp, %ecx
+; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %ebx, %ebx
+; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK22-NEXT: orl %ebp, %ebx
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: movl %ecx, %edx
+; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT: orl %ebp, %edi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT: movl %eax, 28(%edx)
+; FALLBACK22-NEXT: movl %ecx, 4(%edx)
+; FALLBACK22-NEXT: movl %edi, 24(%edx)
+; FALLBACK22-NEXT: movl %ebx, 16(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 8(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, (%edx)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $108, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm2, %xmm2
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: movzbl %al, %ebx
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edi
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %ebx, 4(%eax)
+; FALLBACK23-NEXT: movl %ebp, 24(%eax)
+; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT: movl %ebx, 28(%eax)
+; FALLBACK23-NEXT: movl %esi, 16(%eax)
+; FALLBACK23-NEXT: movl %edi, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 12(%eax)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, (%eax)
+; FALLBACK23-NEXT: addl $108, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %cl
+; FALLBACK24-NEXT: movzbl %cl, %ecx
+; FALLBACK24-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK24-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK24-NEXT: movl %ecx, %edi
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 28(%eax)
+; FALLBACK24-NEXT: movl %esi, 4(%eax)
+; FALLBACK24-NEXT: movl %edi, 24(%eax)
+; FALLBACK24-NEXT: movl %ebp, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $108, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: movzbl %al, %ebp
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl %edi, %esi
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %esi, 4(%ebp)
+; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: shrl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %edx, (%ebp)
+; FALLBACK25-NEXT: addl $108, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %edx
+; FALLBACK26-NEXT: shlb $3, %dl
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %edx, %eax
+; FALLBACK26-NEXT: notb %al
+; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT: movl %eax, %ebp
+; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK26-NEXT: movl %ebp, %ecx
+; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %ebx, %ebx
+; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK26-NEXT: orl %ebp, %ebx
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: movl %ecx, %edx
+; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT: orl %ebp, %edi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT: movl %eax, 28(%edx)
+; FALLBACK26-NEXT: movl %ecx, 4(%edx)
+; FALLBACK26-NEXT: movl %edi, 24(%edx)
+; FALLBACK26-NEXT: movl %ebx, 16(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%edx)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $108, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: movzbl %al, %ebx
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edi
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %ebx, 4(%eax)
+; FALLBACK27-NEXT: movl %ebp, 24(%eax)
+; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT: movl %ebx, 28(%eax)
+; FALLBACK27-NEXT: movl %esi, 16(%eax)
+; FALLBACK27-NEXT: movl %edi, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 12(%eax)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, (%eax)
+; FALLBACK27-NEXT: addl $108, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %cl
+; FALLBACK28-NEXT: movzbl %cl, %ecx
+; FALLBACK28-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK28-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK28-NEXT: movl %ecx, %edi
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 28(%eax)
+; FALLBACK28-NEXT: movl %esi, 4(%eax)
+; FALLBACK28-NEXT: movl %edi, 24(%eax)
+; FALLBACK28-NEXT: movl %ebp, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $108, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: movzbl %al, %ebp
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl %edi, %esi
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %esi, 4(%ebp)
+; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: shrl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %edx, (%ebp)
+; FALLBACK29-NEXT: addl $108, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %edx
+; FALLBACK30-NEXT: shlb $3, %dl
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %edx, %eax
+; FALLBACK30-NEXT: notb %al
+; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT: movl %eax, %ebp
+; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK30-NEXT: movl %ebp, %ecx
+; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %ebx, %ebx
+; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK30-NEXT: orl %ebp, %ebx
+; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: movl %ecx, %edx
+; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT: orl %ebp, %edi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT: movl %eax, 28(%edx)
+; FALLBACK30-NEXT: movl %ecx, 4(%edx)
+; FALLBACK30-NEXT: movl %edi, 24(%edx)
+; FALLBACK30-NEXT: movl %ebx, 16(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%edx)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $108, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: movzbl %al, %ebx
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edi
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %ebx, 4(%eax)
+; FALLBACK31-NEXT: movl %ebp, 24(%eax)
+; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT: movl %ebx, 28(%eax)
+; FALLBACK31-NEXT: movl %esi, 16(%eax)
+; FALLBACK31-NEXT: movl %edi, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 12(%eax)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, (%eax)
+; FALLBACK31-NEXT: addl $108, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
%bitOff = shl i256 %byteOff, 3
@@ -1244,184 +4557,1973 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
ret void
}
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_32bytes:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %rdi
-; X64-SSE2-NEXT: movzbl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andb $31, %sil
-; X64-SSE2-NEXT: negb %sil
-; X64-SSE2-NEXT: movsbq %sil, %rax
-; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rcx
-; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rdi
-; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax
-; X64-SSE2-NEXT: movq %rax, 16(%rdx)
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rcx, (%rdx)
-; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSE42-LABEL: shl_32bytes:
-; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: movups (%rdi), %xmm0
-; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT: movzbl (%rsi), %eax
-; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andb $31, %al
-; X64-SSE42-NEXT: negb %al
-; X64-SSE42-NEXT: movsbq %al, %rax
-; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: retq
-;
-; X64-AVX-LABEL: shl_32bytes:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: movzbl (%rsi), %eax
-; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andb $31, %al
-; X64-AVX-NEXT: negb %al
-; X64-AVX-NEXT: movsbq %al, %rax
-; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0
-; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
-;
-; X86-SSE2-LABEL: shl_32bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: movl (%edi), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%edi), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%edi), %esi
-; X86-SSE2-NEXT: movl 12(%edi), %ebx
-; X86-SSE2-NEXT: movl 16(%edi), %ebp
-; X86-SSE2-NEXT: movzbl (%eax), %eax
-; X86-SSE2-NEXT: movl 20(%edi), %edx
-; X86-SSE2-NEXT: movl 24(%edi), %ecx
-; X86-SSE2-NEXT: movl 28(%edi), %edi
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andb $31, %al
-; X86-SSE2-NEXT: negb %al
-; X86-SSE2-NEXT: movsbl %al, %edx
-; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax
-; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax
-; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi
-; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi
-; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx
-; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp
-; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx
-; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl %edx, 24(%eax)
-; X86-SSE2-NEXT: movl %ecx, 28(%eax)
-; X86-SSE2-NEXT: movl %ebp, 16(%eax)
-; X86-SSE2-NEXT: movl %ebx, 20(%eax)
-; X86-SSE2-NEXT: movl %edi, 8(%eax)
-; X86-SSE2-NEXT: movl %esi, 12(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: shl_32bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $64, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movups 16(%edx), %xmm1
-; X86-SSE42-NEXT: movzbl (%ecx), %ecx
-; X86-SSE42-NEXT: xorps %xmm2, %xmm2
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, (%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andb $31, %cl
-; X86-SSE42-NEXT: negb %cl
-; X86-SSE42-NEXT: movsbl %cl, %ecx
-; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $64, %esp
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX-LABEL: shl_32bytes:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $64, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %ymm0
-; X86-AVX-NEXT: movzbl (%ecx), %ecx
-; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %ymm1, (%esp)
-; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andb $31, %cl
-; X86-AVX-NEXT: negb %cl
-; X86-AVX-NEXT: movsbl %cl, %ecx
-; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $64, %esp
-; X86-AVX-NEXT: vzeroupper
-; X86-AVX-NEXT: retl
+; FALLBACK0-LABEL: shl_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq 16(%rdi), %rcx
+; FALLBACK0-NEXT: movq (%rdi), %r8
+; FALLBACK0-NEXT: movq 8(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: negb %sil
+; FALLBACK0-NEXT: movsbq %sil, %r8
+; FALLBACK0-NEXT: movq -24(%rsp,%r8), %rdi
+; FALLBACK0-NEXT: movq -8(%rsp,%r8), %r10
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK0-NEXT: movq -16(%rsp,%r8), %rbx
+; FALLBACK0-NEXT: movq %r9, %r8
+; FALLBACK0-NEXT: shrq %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: movq %rbx, %r11
+; FALLBACK0-NEXT: shrq %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: orq %r10, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %rbx, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, (%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq 24(%rdi), %rax
+; FALLBACK1-NEXT: movq (%rdi), %r8
+; FALLBACK1-NEXT: movq 8(%rdi), %r9
+; FALLBACK1-NEXT: movq 16(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: negb %sil
+; FALLBACK1-NEXT: movsbq %sil, %rax
+; FALLBACK1-NEXT: movq -40(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -32(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rax
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shldq %cl, %rsi, %r9
+; FALLBACK1-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK1-NEXT: shldq %cl, %r8, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shlq %cl, %rsi
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq 16(%rdi), %rcx
+; FALLBACK2-NEXT: movq (%rdi), %r8
+; FALLBACK2-NEXT: movq 8(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: negb %sil
+; FALLBACK2-NEXT: movsbq %sil, %rsi
+; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rdi
+; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r8
+; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r9
+; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT: shlxq %rax, %r9, %r10
+; FALLBACK2-NEXT: shlxq %rax, %r8, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: shrq %r8
+; FALLBACK2-NEXT: shrxq %rax, %r8, %r8
+; FALLBACK2-NEXT: orq %rdi, %r8
+; FALLBACK2-NEXT: shrq %r9
+; FALLBACK2-NEXT: shrxq %rax, %r9, %rdi
+; FALLBACK2-NEXT: orq %rsi, %rdi
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, (%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT: movq %r8, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq 24(%rdi), %rax
+; FALLBACK3-NEXT: movq (%rdi), %r8
+; FALLBACK3-NEXT: movq 8(%rdi), %r9
+; FALLBACK3-NEXT: movq 16(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: negb %sil
+; FALLBACK3-NEXT: movsbq %sil, %rax
+; FALLBACK3-NEXT: movq -40(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -32(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rax
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shldq %cl, %rsi, %r9
+; FALLBACK3-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK3-NEXT: shldq %cl, %r8, %rdi
+; FALLBACK3-NEXT: shlxq %rcx, %rsi, %rcx
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: negb %cl
+; FALLBACK4-NEXT: movsbq %cl, %r8
+; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rdi
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r9, %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT: movq %r8, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: movq %r9, %r10
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, (%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %r11, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: negb %al
+; FALLBACK5-NEXT: movsbq %al, %rax
+; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq %r8, %r9
+; FALLBACK5-NEXT: shlq %cl, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shldq %cl, %r8, %rax
+; FALLBACK5-NEXT: movq %rax, 8(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: leal (,%rcx,8), %eax
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: negb %cl
+; FALLBACK6-NEXT: movsbq %cl, %rcx
+; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: shrq %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r8, %rcx
+; FALLBACK6-NEXT: shrq %r9
+; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, (%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: negb %al
+; FALLBACK7-NEXT: movsbq %al, %rax
+; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r8, %rax
+; FALLBACK7-NEXT: movq %rax, 8(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: negb %cl
+; FALLBACK8-NEXT: movsbq %cl, %r8
+; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT: movq %r10, %rdi
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r9, %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT: movq %r8, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: movq %r9, %r10
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, (%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %r11, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: negb %al
+; FALLBACK9-NEXT: movsbq %al, %rax
+; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq %r8, %r9
+; FALLBACK9-NEXT: shlq %cl, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shldq %cl, %r8, %rax
+; FALLBACK9-NEXT: movq %rax, 8(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: leal (,%rcx,8), %eax
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: negb %cl
+; FALLBACK10-NEXT: movsbq %cl, %rcx
+; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: shrq %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r8, %rcx
+; FALLBACK10-NEXT: shrq %r9
+; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, (%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: negb %al
+; FALLBACK11-NEXT: movsbq %al, %rax
+; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r8, %rax
+; FALLBACK11-NEXT: movq %rax, 8(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: negb %cl
+; FALLBACK12-NEXT: movsbq %cl, %r8
+; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT: movq %r10, %rdi
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r9, %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT: movq %r8, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: movq %r9, %r10
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, (%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %r11, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: negb %al
+; FALLBACK13-NEXT: movsbq %al, %rax
+; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq %r8, %r9
+; FALLBACK13-NEXT: shlq %cl, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shldq %cl, %r8, %rax
+; FALLBACK13-NEXT: movq %rax, 8(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: leal (,%rcx,8), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: negb %cl
+; FALLBACK14-NEXT: movsbq %cl, %rcx
+; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: shrq %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r8, %rcx
+; FALLBACK14-NEXT: shrq %r9
+; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, (%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: negb %al
+; FALLBACK15-NEXT: movsbq %al, %rax
+; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r8, %rax
+; FALLBACK15-NEXT: movq %rax, 8(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: shl_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl 8(%ecx), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%ecx), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ecx), %esi
+; FALLBACK16-NEXT: movl 28(%ecx), %edi
+; FALLBACK16-NEXT: movl 16(%ecx), %ebx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movl (%ecx), %ebp
+; FALLBACK16-NEXT: movl 4(%ecx), %edx
+; FALLBACK16-NEXT: movl 20(%ecx), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movb %ah, %ch
+; FALLBACK16-NEXT: shlb $3, %ch
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %ah
+; FALLBACK16-NEXT: negb %ah
+; FALLBACK16-NEXT: movsbl %ah, %edi
+; FALLBACK16-NEXT: movl 68(%esp,%edi), %eax
+; FALLBACK16-NEXT: movl %eax, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movb %ch, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 64(%esp,%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: shrl %ebx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: orl %esi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 72(%esp,%edi), %esi
+; FALLBACK16-NEXT: movl %esi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %eax
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 84(%esp,%edi), %esi
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 80(%esp,%edi), %ebp
+; FALLBACK16-NEXT: movl %ebp, %ebx
+; FALLBACK16-NEXT: shrl %ebx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: orl %eax, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %eax
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%edi), %eax
+; FALLBACK16-NEXT: movl 88(%esp,%edi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl %ebp, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %eax, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %ebp, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 24(%eax)
+; FALLBACK16-NEXT: movl %edi, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl %ebx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $108, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl 12(%eax), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%eax), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%eax), %esi
+; FALLBACK17-NEXT: movl 24(%eax), %edi
+; FALLBACK17-NEXT: movl 20(%eax), %ebx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movl (%eax), %edx
+; FALLBACK17-NEXT: movl 4(%eax), %ebp
+; FALLBACK17-NEXT: movl 16(%eax), %eax
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %ch
+; FALLBACK17-NEXT: negb %ch
+; FALLBACK17-NEXT: movsbl %ch, %ebx
+; FALLBACK17-NEXT: movl 64(%esp,%ebx), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 80(%esp,%ebx), %ebp
+; FALLBACK17-NEXT: movl 68(%esp,%ebx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 84(%esp,%ebx), %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shldl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 72(%esp,%ebx), %edi
+; FALLBACK17-NEXT: movl 88(%esp,%ebx), %edx
+; FALLBACK17-NEXT: movl 76(%esp,%ebx), %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 92(%esp,%ebx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %ebx, %ebp
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK17-NEXT: shldl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl %edx, 24(%eax)
+; FALLBACK17-NEXT: movl %ebx, 28(%eax)
+; FALLBACK17-NEXT: movl %ebp, 16(%eax)
+; FALLBACK17-NEXT: movl %esi, 20(%eax)
+; FALLBACK17-NEXT: movl %edi, 8(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, 12(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shll %cl, %edx
+; FALLBACK17-NEXT: movl %edx, (%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT: movl %ecx, 4(%eax)
+; FALLBACK17-NEXT: addl $108, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %esi
+; FALLBACK18-NEXT: movl 28(%eax), %edi
+; FALLBACK18-NEXT: movl 16(%eax), %edx
+; FALLBACK18-NEXT: movzbl (%ebx), %ebx
+; FALLBACK18-NEXT: movl (%eax), %ebp
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl 20(%eax), %eax
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %bl
+; FALLBACK18-NEXT: negb %bl
+; FALLBACK18-NEXT: movsbl %bl, %esi
+; FALLBACK18-NEXT: movl 68(%esp,%esi), %ecx
+; FALLBACK18-NEXT: shlxl %eax, %ecx, %edi
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ebx
+; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebx
+; FALLBACK18-NEXT: orl %edi, %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 72(%esp,%esi), %edi
+; FALLBACK18-NEXT: shlxl %eax, %edi, %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %edx, %ecx, %eax
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%esi), %ecx
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK18-NEXT: movl 84(%esp,%esi), %ecx
+; FALLBACK18-NEXT: shlxl %eax, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK18-NEXT: shlxl %eax, 92(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi
+; FALLBACK18-NEXT: shlxl %eax, %esi, %eax
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: orl %ebx, %esi
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %ebp, (%eax)
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl %esi, 28(%eax)
+; FALLBACK18-NEXT: movl %edi, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $108, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl 12(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%eax), %esi
+; FALLBACK19-NEXT: movl 24(%eax), %edi
+; FALLBACK19-NEXT: movl 20(%eax), %edx
+; FALLBACK19-NEXT: movzbl (%ebx), %ebx
+; FALLBACK19-NEXT: movl (%eax), %ecx
+; FALLBACK19-NEXT: movl 4(%eax), %ebp
+; FALLBACK19-NEXT: movl 16(%eax), %eax
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %bl
+; FALLBACK19-NEXT: negb %bl
+; FALLBACK19-NEXT: movsbl %bl, %ebx
+; FALLBACK19-NEXT: movl 64(%esp,%ebx), %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 80(%esp,%ebx), %esi
+; FALLBACK19-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 84(%esp,%ebx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl %edx, %eax
+; FALLBACK19-NEXT: shldl %cl, %edi, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%ebx), %edi
+; FALLBACK19-NEXT: movl 88(%esp,%ebx), %edx
+; FALLBACK19-NEXT: movl 76(%esp,%ebx), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 92(%esp,%ebx), %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %esi, %eax
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK19-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK19-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %ebp, %edx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl %edx, 24(%ecx)
+; FALLBACK19-NEXT: movl %ebx, 28(%ecx)
+; FALLBACK19-NEXT: movl %esi, 16(%ecx)
+; FALLBACK19-NEXT: movl %eax, 20(%ecx)
+; FALLBACK19-NEXT: movl %edi, 8(%ecx)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ecx)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, (%ecx)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 4(%ecx)
+; FALLBACK19-NEXT: addl $108, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movb %cl, %dh
+; FALLBACK20-NEXT: shlb $3, %dh
+; FALLBACK20-NEXT: xorps %xmm2, %xmm2
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %cl
+; FALLBACK20-NEXT: negb %cl
+; FALLBACK20-NEXT: movsbl %cl, %ebp
+; FALLBACK20-NEXT: movl 80(%esp,%ebp), %eax
+; FALLBACK20-NEXT: movl %eax, %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movb %dh, %dl
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: movl 76(%esp,%ebp), %esi
+; FALLBACK20-NEXT: movl %ebp, %ebx
+; FALLBACK20-NEXT: movl %esi, %ebp
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %edi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl %ebx, %eax
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %esi, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 68(%esp,%eax), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 64(%esp,%eax), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: shrl %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl 88(%esp,%eax), %ebp
+; FALLBACK20-NEXT: movl %ebp, %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %edi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %edx, (%eax)
+; FALLBACK20-NEXT: movl %ebp, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl %ebx, 4(%eax)
+; FALLBACK20-NEXT: movl %esi, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $92, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm2, %xmm2
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: negb %al
+; FALLBACK21-NEXT: movsbl %al, %ebp
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %esi
+; FALLBACK21-NEXT: shldl %cl, %edx, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK21-NEXT: shldl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl %esi, %edx
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shldl %cl, %esi, %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl %edx, 24(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK21-NEXT: movl %edx, %eax
+; FALLBACK21-NEXT: shll %cl, %eax
+; FALLBACK21-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK21-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK21-NEXT: movl %edi, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 12(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 16(%ebp)
+; FALLBACK21-NEXT: movl %eax, (%ebp)
+; FALLBACK21-NEXT: addl $92, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $92, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm2, %xmm2
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: negb %cl
+; FALLBACK22-NEXT: movsbl %cl, %esi
+; FALLBACK22-NEXT: movl 64(%esp,%esi), %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %edi
+; FALLBACK22-NEXT: movl %eax, %edx
+; FALLBACK22-NEXT: notb %dl
+; FALLBACK22-NEXT: movl 60(%esp,%esi), %ebp
+; FALLBACK22-NEXT: movl %ebp, %ebx
+; FALLBACK22-NEXT: shrl %ebx
+; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT: orl %edi, %ebx
+; FALLBACK22-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 68(%esp,%esi), %ebx
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %edx, %ecx, %edi
+; FALLBACK22-NEXT: shlxl %eax, %ebx, %ecx
+; FALLBACK22-NEXT: orl %ecx, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %ebp, %ecx
+; FALLBACK22-NEXT: movl 56(%esp,%esi), %edi
+; FALLBACK22-NEXT: movl %edi, %ebp
+; FALLBACK22-NEXT: shrl %ebp
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %ecx, %ebp
+; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %edi, %ecx
+; FALLBACK22-NEXT: movl 52(%esp,%esi), %edi
+; FALLBACK22-NEXT: movl %edi, %ebp
+; FALLBACK22-NEXT: shrl %ebp
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %ecx, %ebp
+; FALLBACK22-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %edi, %ebp
+; FALLBACK22-NEXT: movl 48(%esp,%esi), %edi
+; FALLBACK22-NEXT: movl %edi, %ecx
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %edi, %edi
+; FALLBACK22-NEXT: shlxl %eax, 76(%esp,%esi), %ebp
+; FALLBACK22-NEXT: movl 72(%esp,%esi), %esi
+; FALLBACK22-NEXT: shlxl %eax, %esi, %eax
+; FALLBACK22-NEXT: shrl %ebx
+; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT: orl %eax, %ebx
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT: movl %edi, (%edx)
+; FALLBACK22-NEXT: movl %eax, 28(%edx)
+; FALLBACK22-NEXT: movl %ebx, 24(%edx)
+; FALLBACK22-NEXT: movl %ecx, 4(%edx)
+; FALLBACK22-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 8(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 16(%edx)
+; FALLBACK22-NEXT: addl $92, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $92, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm2, %xmm2
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: negb %al
+; FALLBACK23-NEXT: movsbl %al, %ebx
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shldl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, %edi
+; FALLBACK23-NEXT: shldl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK23-NEXT: shldl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK23-NEXT: movl 72(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl %esi, %eax
+; FALLBACK23-NEXT: shldl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK23-NEXT: movl %ebx, 28(%esi)
+; FALLBACK23-NEXT: movl %eax, 24(%esi)
+; FALLBACK23-NEXT: shlxl %ecx, %edx, %eax
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK23-NEXT: movl %ebp, 4(%esi)
+; FALLBACK23-NEXT: movl %edi, 8(%esi)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 12(%esi)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 20(%esi)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 16(%esi)
+; FALLBACK23-NEXT: movl %eax, (%esi)
+; FALLBACK23-NEXT: addl $92, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movb %cl, %dh
+; FALLBACK24-NEXT: shlb $3, %dh
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %cl
+; FALLBACK24-NEXT: negb %cl
+; FALLBACK24-NEXT: movsbl %cl, %ebp
+; FALLBACK24-NEXT: movl 80(%esp,%ebp), %eax
+; FALLBACK24-NEXT: movl %eax, %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movb %dh, %dl
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: movl 76(%esp,%ebp), %esi
+; FALLBACK24-NEXT: movl %ebp, %ebx
+; FALLBACK24-NEXT: movl %esi, %ebp
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %edi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl %ebx, %eax
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %esi, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 68(%esp,%eax), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 64(%esp,%eax), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: shrl %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl 88(%esp,%eax), %ebp
+; FALLBACK24-NEXT: movl %ebp, %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %edi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %edx, (%eax)
+; FALLBACK24-NEXT: movl %ebp, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl %ebx, 4(%eax)
+; FALLBACK24-NEXT: movl %esi, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $92, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: negb %al
+; FALLBACK25-NEXT: movsbl %al, %ebp
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %esi
+; FALLBACK25-NEXT: shldl %cl, %edx, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK25-NEXT: shldl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl %esi, %edx
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shldl %cl, %esi, %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl %edx, 24(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK25-NEXT: movl %edx, %eax
+; FALLBACK25-NEXT: shll %cl, %eax
+; FALLBACK25-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK25-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK25-NEXT: movl %edi, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 12(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 16(%ebp)
+; FALLBACK25-NEXT: movl %eax, (%ebp)
+; FALLBACK25-NEXT: addl $92, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $92, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: negb %cl
+; FALLBACK26-NEXT: movsbl %cl, %esi
+; FALLBACK26-NEXT: movl 64(%esp,%esi), %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %edi
+; FALLBACK26-NEXT: movl %eax, %edx
+; FALLBACK26-NEXT: notb %dl
+; FALLBACK26-NEXT: movl 60(%esp,%esi), %ebp
+; FALLBACK26-NEXT: movl %ebp, %ebx
+; FALLBACK26-NEXT: shrl %ebx
+; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT: orl %edi, %ebx
+; FALLBACK26-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 68(%esp,%esi), %ebx
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %edi
+; FALLBACK26-NEXT: shlxl %eax, %ebx, %ecx
+; FALLBACK26-NEXT: orl %ecx, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %ebp, %ecx
+; FALLBACK26-NEXT: movl 56(%esp,%esi), %edi
+; FALLBACK26-NEXT: movl %edi, %ebp
+; FALLBACK26-NEXT: shrl %ebp
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %ecx, %ebp
+; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %edi, %ecx
+; FALLBACK26-NEXT: movl 52(%esp,%esi), %edi
+; FALLBACK26-NEXT: movl %edi, %ebp
+; FALLBACK26-NEXT: shrl %ebp
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %ecx, %ebp
+; FALLBACK26-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %edi, %ebp
+; FALLBACK26-NEXT: movl 48(%esp,%esi), %edi
+; FALLBACK26-NEXT: movl %edi, %ecx
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %ebp, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %edi, %edi
+; FALLBACK26-NEXT: shlxl %eax, 76(%esp,%esi), %ebp
+; FALLBACK26-NEXT: movl 72(%esp,%esi), %esi
+; FALLBACK26-NEXT: shlxl %eax, %esi, %eax
+; FALLBACK26-NEXT: shrl %ebx
+; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT: orl %eax, %ebx
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT: movl %edi, (%edx)
+; FALLBACK26-NEXT: movl %eax, 28(%edx)
+; FALLBACK26-NEXT: movl %ebx, 24(%edx)
+; FALLBACK26-NEXT: movl %ecx, 4(%edx)
+; FALLBACK26-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 16(%edx)
+; FALLBACK26-NEXT: addl $92, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $92, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: negb %al
+; FALLBACK27-NEXT: movsbl %al, %ebx
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shldl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, %edi
+; FALLBACK27-NEXT: shldl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK27-NEXT: shldl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK27-NEXT: movl 72(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl %esi, %eax
+; FALLBACK27-NEXT: shldl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK27-NEXT: movl %ebx, 28(%esi)
+; FALLBACK27-NEXT: movl %eax, 24(%esi)
+; FALLBACK27-NEXT: shlxl %ecx, %edx, %eax
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK27-NEXT: movl %ebp, 4(%esi)
+; FALLBACK27-NEXT: movl %edi, 8(%esi)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 12(%esi)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 20(%esi)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 16(%esi)
+; FALLBACK27-NEXT: movl %eax, (%esi)
+; FALLBACK27-NEXT: addl $92, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movb %cl, %dh
+; FALLBACK28-NEXT: shlb $3, %dh
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %cl
+; FALLBACK28-NEXT: negb %cl
+; FALLBACK28-NEXT: movsbl %cl, %ebp
+; FALLBACK28-NEXT: movl 80(%esp,%ebp), %eax
+; FALLBACK28-NEXT: movl %eax, %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movb %dh, %dl
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: movl 76(%esp,%ebp), %esi
+; FALLBACK28-NEXT: movl %ebp, %ebx
+; FALLBACK28-NEXT: movl %esi, %ebp
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %edi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl %ebx, %eax
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %esi, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 68(%esp,%eax), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 64(%esp,%eax), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: shrl %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl 88(%esp,%eax), %ebp
+; FALLBACK28-NEXT: movl %ebp, %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %edi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %edx, (%eax)
+; FALLBACK28-NEXT: movl %ebp, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl %ebx, 4(%eax)
+; FALLBACK28-NEXT: movl %esi, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $92, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: negb %al
+; FALLBACK29-NEXT: movsbl %al, %ebp
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %esi
+; FALLBACK29-NEXT: shldl %cl, %edx, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK29-NEXT: shldl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl %esi, %edx
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shldl %cl, %esi, %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl %edx, 24(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK29-NEXT: movl %edx, %eax
+; FALLBACK29-NEXT: shll %cl, %eax
+; FALLBACK29-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK29-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK29-NEXT: movl %edi, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 12(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 16(%ebp)
+; FALLBACK29-NEXT: movl %eax, (%ebp)
+; FALLBACK29-NEXT: addl $92, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $92, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: negb %cl
+; FALLBACK30-NEXT: movsbl %cl, %esi
+; FALLBACK30-NEXT: movl 64(%esp,%esi), %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %edi
+; FALLBACK30-NEXT: movl %eax, %edx
+; FALLBACK30-NEXT: notb %dl
+; FALLBACK30-NEXT: movl 60(%esp,%esi), %ebp
+; FALLBACK30-NEXT: movl %ebp, %ebx
+; FALLBACK30-NEXT: shrl %ebx
+; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT: orl %edi, %ebx
+; FALLBACK30-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 68(%esp,%esi), %ebx
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %edx, %ecx, %edi
+; FALLBACK30-NEXT: shlxl %eax, %ebx, %ecx
+; FALLBACK30-NEXT: orl %ecx, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %ebp, %ecx
+; FALLBACK30-NEXT: movl 56(%esp,%esi), %edi
+; FALLBACK30-NEXT: movl %edi, %ebp
+; FALLBACK30-NEXT: shrl %ebp
+; FALLBACK30-NEXT: shrxl %edx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %ecx, %ebp
+; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %edi, %ecx
+; FALLBACK30-NEXT: movl 52(%esp,%esi), %edi
+; FALLBACK30-NEXT: movl %edi, %ebp
+; FALLBACK30-NEXT: shrl %ebp
+; FALLBACK30-NEXT: shrxl %edx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %ecx, %ebp
+; FALLBACK30-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %edi, %ebp
+; FALLBACK30-NEXT: movl 48(%esp,%esi), %edi
+; FALLBACK30-NEXT: movl %edi, %ecx
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %ebp, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %edi, %edi
+; FALLBACK30-NEXT: shlxl %eax, 76(%esp,%esi), %ebp
+; FALLBACK30-NEXT: movl 72(%esp,%esi), %esi
+; FALLBACK30-NEXT: shlxl %eax, %esi, %eax
+; FALLBACK30-NEXT: shrl %ebx
+; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT: orl %eax, %ebx
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT: movl %edi, (%edx)
+; FALLBACK30-NEXT: movl %eax, 28(%edx)
+; FALLBACK30-NEXT: movl %ebx, 24(%edx)
+; FALLBACK30-NEXT: movl %ecx, 4(%edx)
+; FALLBACK30-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 16(%edx)
+; FALLBACK30-NEXT: addl $92, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $92, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: negb %al
+; FALLBACK31-NEXT: movsbl %al, %ebx
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shldl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, %edi
+; FALLBACK31-NEXT: shldl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK31-NEXT: shldl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK31-NEXT: movl 72(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl %esi, %eax
+; FALLBACK31-NEXT: shldl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK31-NEXT: movl %ebx, 28(%esi)
+; FALLBACK31-NEXT: movl %eax, 24(%esi)
+; FALLBACK31-NEXT: shlxl %ecx, %edx, %eax
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK31-NEXT: movl %ebp, 4(%esi)
+; FALLBACK31-NEXT: movl %edi, 8(%esi)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 12(%esi)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 20(%esi)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 16(%esi)
+; FALLBACK31-NEXT: movl %eax, (%esi)
+; FALLBACK31-NEXT: addl $92, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
%bitOff = shl i256 %byteOff, 3
@@ -1430,222 +6532,2199 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
ret void
}
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_32bytes:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %rdi
-; X64-SSE2-NEXT: movzbl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: sarq $63, %rdi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $31, %esi
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rax, (%rdx)
-; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSE42-LABEL: ashr_32bytes:
-; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: movups (%rdi), %xmm0
-; X64-SSE42-NEXT: movq 16(%rdi), %rax
-; X64-SSE42-NEXT: movq 24(%rdi), %rcx
-; X64-SSE42-NEXT: movzbl (%rsi), %esi
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: sarq $63, %rcx
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $31, %esi
-; X64-SSE42-NEXT: movups -64(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rsi), %xmm1
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: retq
-;
-; X64-AVX-LABEL: ashr_32bytes:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %xmm0
-; X64-AVX-NEXT: movq 16(%rdi), %rax
-; X64-AVX-NEXT: movq 24(%rdi), %rcx
-; X64-AVX-NEXT: movzbl (%rsi), %esi
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: sarq $63, %rcx
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $31, %esi
-; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: retq
-;
-; X86-SSE2-LABEL: ashr_32bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%eax), %edi
-; X86-SSE2-NEXT: movl 12(%eax), %ebx
-; X86-SSE2-NEXT: movl 16(%eax), %ebp
-; X86-SSE2-NEXT: movl 20(%eax), %esi
-; X86-SSE2-NEXT: movl 24(%eax), %edx
-; X86-SSE2-NEXT: movl 28(%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movzbl (%eax), %eax
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: sarl $31, %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl %ecx, 24(%eax)
-; X86-SSE2-NEXT: movl %edx, 28(%eax)
-; X86-SSE2-NEXT: movl %ebp, 16(%eax)
-; X86-SSE2-NEXT: movl %ebx, 20(%eax)
-; X86-SSE2-NEXT: movl %edi, 8(%eax)
-; X86-SSE2-NEXT: movl %esi, 12(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: ashr_32bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: pushl %ebx
-; X86-SSE42-NEXT: pushl %edi
-; X86-SSE42-NEXT: pushl %esi
-; X86-SSE42-NEXT: subl $64, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movl 16(%edx), %esi
-; X86-SSE42-NEXT: movl 20(%edx), %edi
-; X86-SSE42-NEXT: movl 24(%edx), %ebx
-; X86-SSE42-NEXT: movl 28(%edx), %edx
-; X86-SSE42-NEXT: movzbl (%ecx), %ecx
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: sarl $31, %edx
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $31, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $64, %esp
-; X86-SSE42-NEXT: popl %esi
-; X86-SSE42-NEXT: popl %edi
-; X86-SSE42-NEXT: popl %ebx
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX-LABEL: ashr_32bytes:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: pushl %ebx
-; X86-AVX-NEXT: pushl %edi
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: subl $64, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %xmm0
-; X86-AVX-NEXT: movl 16(%edx), %esi
-; X86-AVX-NEXT: movl 20(%edx), %edi
-; X86-AVX-NEXT: movl 24(%edx), %ebx
-; X86-AVX-NEXT: movl 28(%edx), %edx
-; X86-AVX-NEXT: movzbl (%ecx), %ecx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm0, (%esp)
-; X86-AVX-NEXT: sarl $31, %edx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $31, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $64, %esp
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: popl %edi
-; X86-AVX-NEXT: popl %ebx
-; X86-AVX-NEXT: retl
+; FALLBACK0-LABEL: ashr_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq 16(%rdi), %rcx
+; FALLBACK0-NEXT: movq (%rdi), %r8
+; FALLBACK0-NEXT: movq 8(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %rdi
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r8
+; FALLBACK0-NEXT: movq %rdi, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r11
+; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r10, %r9
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r11, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: leaq (%r8,%r8), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq 24(%rdi), %rax
+; FALLBACK1-NEXT: movq (%rdi), %r8
+; FALLBACK1-NEXT: movq 8(%rdi), %r9
+; FALLBACK1-NEXT: movq 16(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %rax
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq 16(%rdi), %rcx
+; FALLBACK2-NEXT: movq (%rdi), %r8
+; FALLBACK2-NEXT: movq 8(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %r9
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT: shrxq %rax, %r9, %r10
+; FALLBACK2-NEXT: sarxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %r9, %r9
+; FALLBACK2-NEXT: shlxq %rax, %r9, %r9
+; FALLBACK2-NEXT: orq %r8, %r9
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %rcx, %rsi
+; FALLBACK2-NEXT: leaq (%rdi,%rdi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %r9, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq 24(%rdi), %rax
+; FALLBACK3-NEXT: movq (%rdi), %r8
+; FALLBACK3-NEXT: movq 8(%rdi), %r9
+; FALLBACK3-NEXT: movq 16(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %rax
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rdi
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movq 24(%rdi), %rcx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movq 16(%rdi), %rdi
+; FALLBACK4-NEXT: movzbl (%rsi), %esi
+; FALLBACK4-NEXT: leal (,%rsi,8), %eax
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rcx
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %sil
+; FALLBACK4-NEXT: movzbl %sil, %r8d
+; FALLBACK4-NEXT: movq -64(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movq -48(%rsp,%r8), %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -56(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r11
+; FALLBACK4-NEXT: leaq (%r10,%r10), %r8
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: orq %r9, %r8
+; FALLBACK4-NEXT: movq %rdi, %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: leaq (%r11,%r11), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r9, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: addq %rdi, %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %r11
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %r8, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movq 16(%rdi), %rax
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movq 24(%rdi), %rdi
+; FALLBACK5-NEXT: movzbl (%rsi), %esi
+; FALLBACK5-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %sil
+; FALLBACK5-NEXT: movzbl %sil, %eax
+; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r8
+; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rax
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq %rax, %r9
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK5-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: sarq %cl, %rdi
+; FALLBACK5-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK5-NEXT: movq %r9, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r8, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movq 24(%rdi), %rcx
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movq 16(%rdi), %rdi
+; FALLBACK6-NEXT: movzbl (%rsi), %esi
+; FALLBACK6-NEXT: leal (,%rsi,8), %eax
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rcx
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %sil
+; FALLBACK6-NEXT: movzbl %sil, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %r8
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %r9
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: sarxq %rax, %r8, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: leaq (%r8,%r8), %rsi
+; FALLBACK6-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK6-NEXT: orq %r9, %rsi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movq 16(%rdi), %rax
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movq 24(%rdi), %rdi
+; FALLBACK7-NEXT: movzbl (%rsi), %esi
+; FALLBACK7-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %sil
+; FALLBACK7-NEXT: movzbl %sil, %eax
+; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rax
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq %rax, %r9
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK7-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK7-NEXT: sarxq %rcx, %rdi, %rax
+; FALLBACK7-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK7-NEXT: movq %r9, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r8, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: movq 24(%rdi), %rcx
+; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT: movq 16(%rdi), %rdi
+; FALLBACK8-NEXT: movzbl (%rsi), %esi
+; FALLBACK8-NEXT: leal (,%rsi,8), %eax
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rcx
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %sil
+; FALLBACK8-NEXT: movzbl %sil, %r8d
+; FALLBACK8-NEXT: movq -64(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movq -48(%rsp,%r8), %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -56(%rsp,%r8), %r10
+; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r11
+; FALLBACK8-NEXT: leaq (%r10,%r10), %r8
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: orq %r9, %r8
+; FALLBACK8-NEXT: movq %rdi, %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: leaq (%r11,%r11), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r9, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: addq %rdi, %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %r11
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %r8, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: movq 16(%rdi), %rax
+; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT: movq 24(%rdi), %rdi
+; FALLBACK9-NEXT: movzbl (%rsi), %esi
+; FALLBACK9-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %sil
+; FALLBACK9-NEXT: movzbl %sil, %eax
+; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r8
+; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rax
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq %rax, %r9
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK9-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: sarq %cl, %rdi
+; FALLBACK9-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK9-NEXT: movq %r9, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r8, (%rdx)
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: movq 24(%rdi), %rcx
+; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT: movq 16(%rdi), %rdi
+; FALLBACK10-NEXT: movzbl (%rsi), %esi
+; FALLBACK10-NEXT: leal (,%rsi,8), %eax
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rcx
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %sil
+; FALLBACK10-NEXT: movzbl %sil, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %r8
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %r9
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: sarxq %rax, %r8, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: leaq (%r8,%r8), %rsi
+; FALLBACK10-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK10-NEXT: orq %r9, %rsi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: movq 16(%rdi), %rax
+; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT: movq 24(%rdi), %rdi
+; FALLBACK11-NEXT: movzbl (%rsi), %esi
+; FALLBACK11-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %sil
+; FALLBACK11-NEXT: movzbl %sil, %eax
+; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rax
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq %rax, %r9
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK11-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK11-NEXT: sarxq %rcx, %rdi, %rax
+; FALLBACK11-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK11-NEXT: movq %r9, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r8, (%rdx)
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: movq 24(%rdi), %rcx
+; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT: movq 16(%rdi), %rdi
+; FALLBACK12-NEXT: movzbl (%rsi), %esi
+; FALLBACK12-NEXT: leal (,%rsi,8), %eax
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rcx
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %sil
+; FALLBACK12-NEXT: movzbl %sil, %r8d
+; FALLBACK12-NEXT: movq -64(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movq -48(%rsp,%r8), %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -56(%rsp,%r8), %r10
+; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r11
+; FALLBACK12-NEXT: leaq (%r10,%r10), %r8
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: orq %r9, %r8
+; FALLBACK12-NEXT: movq %rdi, %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: leaq (%r11,%r11), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r9, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: addq %rdi, %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %r11
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %r8, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: movq 16(%rdi), %rax
+; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT: movq 24(%rdi), %rdi
+; FALLBACK13-NEXT: movzbl (%rsi), %esi
+; FALLBACK13-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %sil
+; FALLBACK13-NEXT: movzbl %sil, %eax
+; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r8
+; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rax
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq %rax, %r9
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK13-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: sarq %cl, %rdi
+; FALLBACK13-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK13-NEXT: movq %r9, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r8, (%rdx)
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: movq 24(%rdi), %rcx
+; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT: movq 16(%rdi), %rdi
+; FALLBACK14-NEXT: movzbl (%rsi), %esi
+; FALLBACK14-NEXT: leal (,%rsi,8), %eax
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rcx
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %sil
+; FALLBACK14-NEXT: movzbl %sil, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %r8
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %r9
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: sarxq %rax, %r8, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: leaq (%r8,%r8), %rsi
+; FALLBACK14-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK14-NEXT: orq %r9, %rsi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: movq 16(%rdi), %rax
+; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT: movq 24(%rdi), %rdi
+; FALLBACK15-NEXT: movzbl (%rsi), %esi
+; FALLBACK15-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %sil
+; FALLBACK15-NEXT: movzbl %sil, %eax
+; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rax
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq %rax, %r9
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r9
+; FALLBACK15-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK15-NEXT: sarxq %rcx, %rdi, %rax
+; FALLBACK15-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK15-NEXT: movq %r9, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r8, (%rdx)
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: ashr_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl 12(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%eax), %edi
+; FALLBACK16-NEXT: movl 24(%eax), %ebx
+; FALLBACK16-NEXT: movl 28(%eax), %esi
+; FALLBACK16-NEXT: movzbl (%edx), %edx
+; FALLBACK16-NEXT: movl (%eax), %ebp
+; FALLBACK16-NEXT: movl 4(%eax), %ecx
+; FALLBACK16-NEXT: movl 20(%eax), %eax
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movb %dl, %dh
+; FALLBACK16-NEXT: shlb $3, %dh
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: sarl $31, %esi
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %dl
+; FALLBACK16-NEXT: movzbl %dl, %ebx
+; FALLBACK16-NEXT: movl 36(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movb %dh, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 40(%esp,%ebx), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %eax, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%esp,%ebx), %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 44(%esp,%ebx), %ebp
+; FALLBACK16-NEXT: movl %ebp, %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 48(%esp,%ebx), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebx, %eax
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 52(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl 56(%esp,%eax), %eax
+; FALLBACK16-NEXT: leal (%eax,%eax), %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %eax
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: sarl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl %ebx, 28(%ecx)
+; FALLBACK16-NEXT: movl %eax, 24(%ecx)
+; FALLBACK16-NEXT: movl %esi, 16(%ecx)
+; FALLBACK16-NEXT: movl %edi, 20(%ecx)
+; FALLBACK16-NEXT: movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 12(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, (%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 4(%ecx)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: ashr_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl 12(%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl (%ecx), %edi
+; FALLBACK17-NEXT: movl 4(%ecx), %ebx
+; FALLBACK17-NEXT: movl 20(%ecx), %ebp
+; FALLBACK17-NEXT: movzbl (%eax), %edx
+; FALLBACK17-NEXT: movl 28(%ecx), %esi
+; FALLBACK17-NEXT: movl 8(%ecx), %eax
+; FALLBACK17-NEXT: movl 24(%ecx), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, %ecx
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: sarl $31, %esi
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %dl
+; FALLBACK17-NEXT: movzbl %dl, %eax
+; FALLBACK17-NEXT: movl 24(%esp,%eax), %esi
+; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%esp,%eax), %edi
+; FALLBACK17-NEXT: movl 20(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl 36(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %edx, %ebx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%esp,%eax), %esi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl 28(%esp,%eax), %ebp
+; FALLBACK17-NEXT: movl 44(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl %ebp, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, %ebp
+; FALLBACK17-NEXT: shrdl %cl, %edi, %ebp
+; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl %edi, 24(%eax)
+; FALLBACK17-NEXT: sarl %cl, %edx
+; FALLBACK17-NEXT: movl %edx, 28(%eax)
+; FALLBACK17-NEXT: movl %ebx, 16(%eax)
+; FALLBACK17-NEXT: movl %ebp, 20(%eax)
+; FALLBACK17-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK17-NEXT: movl %ecx, 8(%eax)
+; FALLBACK17-NEXT: movl %esi, 12(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT: movl %ecx, (%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT: movl %ecx, 4(%eax)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: ashr_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %edi
+; FALLBACK18-NEXT: movl 24(%eax), %esi
+; FALLBACK18-NEXT: movl 28(%eax), %ebp
+; FALLBACK18-NEXT: movzbl (%edx), %edx
+; FALLBACK18-NEXT: movl (%eax), %ebx
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl 20(%eax), %eax
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: sarl $31, %ebp
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %dl
+; FALLBACK18-NEXT: movzbl %dl, %esi
+; FALLBACK18-NEXT: movl 36(%esp,%esi), %edi
+; FALLBACK18-NEXT: shrxl %eax, %edi, %ebx
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: movl 40(%esp,%esi), %ecx
+; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp
+; FALLBACK18-NEXT: orl %ebx, %ebp
+; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %ebx
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT: orl %ebx, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%esp,%esi), %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %edi
+; FALLBACK18-NEXT: movl 44(%esp,%esi), %ebp
+; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx
+; FALLBACK18-NEXT: orl %ebx, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl %ecx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 56(%esp,%esi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi
+; FALLBACK18-NEXT: sarxl %ebx, %esi, %ebx
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %edx
+; FALLBACK18-NEXT: orl %eax, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %ebx, 28(%eax)
+; FALLBACK18-NEXT: movl %edx, 24(%eax)
+; FALLBACK18-NEXT: movl %edi, 16(%eax)
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: ashr_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl 12(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl (%eax), %edi
+; FALLBACK19-NEXT: movl 4(%eax), %ebx
+; FALLBACK19-NEXT: movl 20(%eax), %esi
+; FALLBACK19-NEXT: movzbl (%edx), %edx
+; FALLBACK19-NEXT: movl 28(%eax), %ebp
+; FALLBACK19-NEXT: movl 8(%eax), %ecx
+; FALLBACK19-NEXT: movl 24(%eax), %eax
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: sarl $31, %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %dl
+; FALLBACK19-NEXT: movzbl %dl, %eax
+; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl 40(%esp,%eax), %ebp
+; FALLBACK19-NEXT: movl 20(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 36(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl %edx, %edi
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%esp,%eax), %ebx
+; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 44(%esp,%eax), %eax
+; FALLBACK19-NEXT: movl %edx, %edi
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, %esi
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT: sarxl %ecx, %eax, %edx
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl %ebp, 24(%ecx)
+; FALLBACK19-NEXT: movl %edx, 28(%ecx)
+; FALLBACK19-NEXT: movl %ebx, 16(%ecx)
+; FALLBACK19-NEXT: movl %esi, 20(%ecx)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ecx)
+; FALLBACK19-NEXT: movl %edi, 12(%ecx)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, (%ecx)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 4(%ecx)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: ashr_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edi
+; FALLBACK20-NEXT: movl 24(%edi), %ecx
+; FALLBACK20-NEXT: movl 20(%edi), %eax
+; FALLBACK20-NEXT: movl 28(%edi), %esi
+; FALLBACK20-NEXT: movups (%edi), %xmm0
+; FALLBACK20-NEXT: movl 16(%edi), %edi
+; FALLBACK20-NEXT: movzbl (%edx), %edx
+; FALLBACK20-NEXT: movb %dl, %dh
+; FALLBACK20-NEXT: shlb $3, %dh
+; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: sarl $31, %esi
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %dl
+; FALLBACK20-NEXT: movzbl %dl, %ebx
+; FALLBACK20-NEXT: movl 32(%esp,%ebx), %eax
+; FALLBACK20-NEXT: movl 48(%esp,%ebx), %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movb %dh, %dl
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: movl 36(%esp,%ebx), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %eax, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%esp,%ebx), %edi
+; FALLBACK20-NEXT: movl %edi, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: leal (%ebp,%ebp), %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %eax, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 40(%esp,%ebx), %esi
+; FALLBACK20-NEXT: movl %esi, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %edi, %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %eax, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK20-NEXT: movl %ebp, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl 56(%esp,%ebx), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %eax, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %eax, %ebp
+; FALLBACK20-NEXT: movl 60(%esp,%ebx), %ebx
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %edi, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %edi, %esi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: sarl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movl %ebx, 28(%ecx)
+; FALLBACK20-NEXT: movl %esi, 4(%ecx)
+; FALLBACK20-NEXT: movl %eax, 24(%ecx)
+; FALLBACK20-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 20(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 8(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 12(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, (%ecx)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: ashr_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $108, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movl 24(%edx), %eax
+; FALLBACK21-NEXT: movl 16(%edx), %esi
+; FALLBACK21-NEXT: movl 28(%edx), %edi
+; FALLBACK21-NEXT: movups (%edx), %xmm0
+; FALLBACK21-NEXT: movl 20(%edx), %ebx
+; FALLBACK21-NEXT: movzbl (%ecx), %edx
+; FALLBACK21-NEXT: movl %edx, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: sarl $31, %edi
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %dl
+; FALLBACK21-NEXT: movzbl %dl, %eax
+; FALLBACK21-NEXT: movl 36(%esp,%eax), %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 32(%esp,%eax), %edx
+; FALLBACK21-NEXT: movl 48(%esp,%eax), %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 44(%esp,%eax), %esi
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shrdl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 52(%esp,%eax), %edi
+; FALLBACK21-NEXT: movl 60(%esp,%eax), %ebx
+; FALLBACK21-NEXT: movl 40(%esp,%eax), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%eax), %eax
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, %ebp
+; FALLBACK21-NEXT: movl %edi, %edx
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK21-NEXT: shrdl %cl, %ebx, %eax
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edi
+; FALLBACK21-NEXT: movl %esi, 4(%edi)
+; FALLBACK21-NEXT: movl %eax, 24(%edi)
+; FALLBACK21-NEXT: sarl %cl, %ebx
+; FALLBACK21-NEXT: movl %ebx, 28(%edi)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%edi)
+; FALLBACK21-NEXT: movl %edx, 20(%edi)
+; FALLBACK21-NEXT: movl %ebp, 8(%edi)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%edi)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, (%edi)
+; FALLBACK21-NEXT: addl $108, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: ashr_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movl 24(%ecx), %edx
+; FALLBACK22-NEXT: movl 20(%ecx), %esi
+; FALLBACK22-NEXT: movl 28(%ecx), %edi
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movl 16(%ecx), %ebx
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: sarl $31, %edi
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %eax, %edx
+; FALLBACK22-NEXT: movl %eax, %ebx
+; FALLBACK22-NEXT: notb %dl
+; FALLBACK22-NEXT: movl 36(%esp,%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %eax
+; FALLBACK22-NEXT: orl %ecx, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%esp,%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %eax
+; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ebx, %esi
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebx
+; FALLBACK22-NEXT: orl %ebx, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %eax
+; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %esi, %ecx, %ebx
+; FALLBACK22-NEXT: movl %esi, %ecx
+; FALLBACK22-NEXT: orl %ebx, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 56(%esp,%edi), %ebx
+; FALLBACK22-NEXT: leal (%ebx,%ebx), %ebp
+; FALLBACK22-NEXT: shlxl %edx, %ebp, %ebp
+; FALLBACK22-NEXT: movl 52(%esp,%edi), %eax
+; FALLBACK22-NEXT: shrxl %esi, %eax, %esi
+; FALLBACK22-NEXT: orl %esi, %ebp
+; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %eax, %eax
+; FALLBACK22-NEXT: shlxl %edx, %eax, %esi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrxl %ecx, %ebx, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT: sarxl %ecx, %edi, %ebx
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl %ebx, 28(%eax)
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: movl %edi, 24(%eax)
+; FALLBACK22-NEXT: movl %esi, 16(%eax)
+; FALLBACK22-NEXT: movl %ebp, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, (%eax)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: ashr_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $108, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movl 24(%edx), %eax
+; FALLBACK23-NEXT: movl 16(%edx), %esi
+; FALLBACK23-NEXT: movl 28(%edx), %edi
+; FALLBACK23-NEXT: movups (%edx), %xmm0
+; FALLBACK23-NEXT: movl 20(%edx), %ebx
+; FALLBACK23-NEXT: movzbl (%ecx), %edx
+; FALLBACK23-NEXT: movl %edx, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: sarl $31, %edi
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %dl
+; FALLBACK23-NEXT: movzbl %dl, %eax
+; FALLBACK23-NEXT: movl 36(%esp,%eax), %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 32(%esp,%eax), %edi
+; FALLBACK23-NEXT: movl 48(%esp,%eax), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 44(%esp,%eax), %edi
+; FALLBACK23-NEXT: movl %edi, %ebx
+; FALLBACK23-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 52(%esp,%eax), %esi
+; FALLBACK23-NEXT: movl 60(%esp,%eax), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%esp,%eax), %ebx
+; FALLBACK23-NEXT: movl 56(%esp,%eax), %ebp
+; FALLBACK23-NEXT: movl %ebx, %eax
+; FALLBACK23-NEXT: shrdl %cl, %edi, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl %esi, %eax
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT: sarxl %ecx, %eax, %edi
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %ebx, %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl %eax, 4(%ecx)
+; FALLBACK23-NEXT: movl %ebp, 24(%ecx)
+; FALLBACK23-NEXT: movl %edi, 28(%ecx)
+; FALLBACK23-NEXT: movl %edx, 16(%ecx)
+; FALLBACK23-NEXT: movl %esi, 20(%ecx)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 8(%ecx)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 12(%ecx)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, (%ecx)
+; FALLBACK23-NEXT: addl $108, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: ashr_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edi
+; FALLBACK24-NEXT: movl 24(%edi), %ecx
+; FALLBACK24-NEXT: movl 20(%edi), %eax
+; FALLBACK24-NEXT: movl 28(%edi), %esi
+; FALLBACK24-NEXT: vmovups (%edi), %xmm0
+; FALLBACK24-NEXT: movl 16(%edi), %edi
+; FALLBACK24-NEXT: movzbl (%edx), %edx
+; FALLBACK24-NEXT: movb %dl, %dh
+; FALLBACK24-NEXT: shlb $3, %dh
+; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: sarl $31, %esi
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %dl
+; FALLBACK24-NEXT: movzbl %dl, %ebx
+; FALLBACK24-NEXT: movl 32(%esp,%ebx), %eax
+; FALLBACK24-NEXT: movl 48(%esp,%ebx), %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movb %dh, %dl
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: movl 36(%esp,%ebx), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %eax, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%esp,%ebx), %edi
+; FALLBACK24-NEXT: movl %edi, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: leal (%ebp,%ebp), %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %eax, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 40(%esp,%ebx), %esi
+; FALLBACK24-NEXT: movl %esi, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %edi, %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %eax, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl 56(%esp,%ebx), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %eax, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %eax, %ebp
+; FALLBACK24-NEXT: movl 60(%esp,%ebx), %ebx
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %edi, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %edi, %esi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: sarl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: movl %ebx, 28(%ecx)
+; FALLBACK24-NEXT: movl %esi, 4(%ecx)
+; FALLBACK24-NEXT: movl %eax, 24(%ecx)
+; FALLBACK24-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 20(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 8(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 12(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, (%ecx)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: ashr_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $108, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: movl 24(%edx), %eax
+; FALLBACK25-NEXT: movl 16(%edx), %esi
+; FALLBACK25-NEXT: movl 28(%edx), %edi
+; FALLBACK25-NEXT: vmovups (%edx), %xmm0
+; FALLBACK25-NEXT: movl 20(%edx), %ebx
+; FALLBACK25-NEXT: movzbl (%ecx), %edx
+; FALLBACK25-NEXT: movl %edx, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: sarl $31, %edi
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %dl
+; FALLBACK25-NEXT: movzbl %dl, %eax
+; FALLBACK25-NEXT: movl 36(%esp,%eax), %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 32(%esp,%eax), %edx
+; FALLBACK25-NEXT: movl 48(%esp,%eax), %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 44(%esp,%eax), %esi
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shrdl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 52(%esp,%eax), %edi
+; FALLBACK25-NEXT: movl 60(%esp,%eax), %ebx
+; FALLBACK25-NEXT: movl 40(%esp,%eax), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%eax), %eax
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, %ebp
+; FALLBACK25-NEXT: movl %edi, %edx
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK25-NEXT: shrdl %cl, %ebx, %eax
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edi
+; FALLBACK25-NEXT: movl %esi, 4(%edi)
+; FALLBACK25-NEXT: movl %eax, 24(%edi)
+; FALLBACK25-NEXT: sarl %cl, %ebx
+; FALLBACK25-NEXT: movl %ebx, 28(%edi)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%edi)
+; FALLBACK25-NEXT: movl %edx, 20(%edi)
+; FALLBACK25-NEXT: movl %ebp, 8(%edi)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%edi)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, (%edi)
+; FALLBACK25-NEXT: addl $108, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: ashr_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: movl 24(%ecx), %edx
+; FALLBACK26-NEXT: movl 20(%ecx), %esi
+; FALLBACK26-NEXT: movl 28(%ecx), %edi
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movl 16(%ecx), %ebx
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: sarl $31, %edi
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %eax, %edx
+; FALLBACK26-NEXT: movl %eax, %ebx
+; FALLBACK26-NEXT: notb %dl
+; FALLBACK26-NEXT: movl 36(%esp,%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %eax
+; FALLBACK26-NEXT: orl %ecx, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%esp,%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %eax
+; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ebx, %esi
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebx
+; FALLBACK26-NEXT: orl %ebx, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %eax
+; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %esi, %ecx, %ebx
+; FALLBACK26-NEXT: movl %esi, %ecx
+; FALLBACK26-NEXT: orl %ebx, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 56(%esp,%edi), %ebx
+; FALLBACK26-NEXT: leal (%ebx,%ebx), %ebp
+; FALLBACK26-NEXT: shlxl %edx, %ebp, %ebp
+; FALLBACK26-NEXT: movl 52(%esp,%edi), %eax
+; FALLBACK26-NEXT: shrxl %esi, %eax, %esi
+; FALLBACK26-NEXT: orl %esi, %ebp
+; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %eax, %eax
+; FALLBACK26-NEXT: shlxl %edx, %eax, %esi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrxl %ecx, %ebx, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT: sarxl %ecx, %edi, %ebx
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl %ebx, 28(%eax)
+; FALLBACK26-NEXT: movl %ecx, 4(%eax)
+; FALLBACK26-NEXT: movl %edi, 24(%eax)
+; FALLBACK26-NEXT: movl %esi, 16(%eax)
+; FALLBACK26-NEXT: movl %ebp, 20(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 8(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 12(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, (%eax)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: ashr_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $108, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: movl 24(%edx), %eax
+; FALLBACK27-NEXT: movl 16(%edx), %esi
+; FALLBACK27-NEXT: movl 28(%edx), %edi
+; FALLBACK27-NEXT: vmovups (%edx), %xmm0
+; FALLBACK27-NEXT: movl 20(%edx), %ebx
+; FALLBACK27-NEXT: movzbl (%ecx), %edx
+; FALLBACK27-NEXT: movl %edx, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: sarl $31, %edi
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %dl
+; FALLBACK27-NEXT: movzbl %dl, %eax
+; FALLBACK27-NEXT: movl 36(%esp,%eax), %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 32(%esp,%eax), %edi
+; FALLBACK27-NEXT: movl 48(%esp,%eax), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 44(%esp,%eax), %edi
+; FALLBACK27-NEXT: movl %edi, %ebx
+; FALLBACK27-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 52(%esp,%eax), %esi
+; FALLBACK27-NEXT: movl 60(%esp,%eax), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%esp,%eax), %ebx
+; FALLBACK27-NEXT: movl 56(%esp,%eax), %ebp
+; FALLBACK27-NEXT: movl %ebx, %eax
+; FALLBACK27-NEXT: shrdl %cl, %edi, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl %esi, %eax
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT: sarxl %ecx, %eax, %edi
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %ebx, %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl %eax, 4(%ecx)
+; FALLBACK27-NEXT: movl %ebp, 24(%ecx)
+; FALLBACK27-NEXT: movl %edi, 28(%ecx)
+; FALLBACK27-NEXT: movl %edx, 16(%ecx)
+; FALLBACK27-NEXT: movl %esi, 20(%ecx)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 8(%ecx)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 12(%ecx)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, (%ecx)
+; FALLBACK27-NEXT: addl $108, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: ashr_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edi
+; FALLBACK28-NEXT: movl 24(%edi), %ecx
+; FALLBACK28-NEXT: movl 20(%edi), %eax
+; FALLBACK28-NEXT: movl 28(%edi), %esi
+; FALLBACK28-NEXT: vmovups (%edi), %xmm0
+; FALLBACK28-NEXT: movl 16(%edi), %edi
+; FALLBACK28-NEXT: movzbl (%edx), %edx
+; FALLBACK28-NEXT: movb %dl, %dh
+; FALLBACK28-NEXT: shlb $3, %dh
+; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: sarl $31, %esi
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %dl
+; FALLBACK28-NEXT: movzbl %dl, %ebx
+; FALLBACK28-NEXT: movl 32(%esp,%ebx), %eax
+; FALLBACK28-NEXT: movl 48(%esp,%ebx), %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movb %dh, %dl
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: movl 36(%esp,%ebx), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %eax, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%esp,%ebx), %edi
+; FALLBACK28-NEXT: movl %edi, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: leal (%ebp,%ebp), %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %eax, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 40(%esp,%ebx), %esi
+; FALLBACK28-NEXT: movl %esi, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %edi, %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %eax, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl 56(%esp,%ebx), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %eax, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %eax, %ebp
+; FALLBACK28-NEXT: movl 60(%esp,%ebx), %ebx
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %edi, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %edi, %esi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: sarl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: movl %ebx, 28(%ecx)
+; FALLBACK28-NEXT: movl %esi, 4(%ecx)
+; FALLBACK28-NEXT: movl %eax, 24(%ecx)
+; FALLBACK28-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 20(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 8(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 12(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, (%ecx)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: ashr_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $108, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: movl 24(%edx), %eax
+; FALLBACK29-NEXT: movl 16(%edx), %esi
+; FALLBACK29-NEXT: movl 28(%edx), %edi
+; FALLBACK29-NEXT: vmovups (%edx), %xmm0
+; FALLBACK29-NEXT: movl 20(%edx), %ebx
+; FALLBACK29-NEXT: movzbl (%ecx), %edx
+; FALLBACK29-NEXT: movl %edx, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: sarl $31, %edi
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %dl
+; FALLBACK29-NEXT: movzbl %dl, %eax
+; FALLBACK29-NEXT: movl 36(%esp,%eax), %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 32(%esp,%eax), %edx
+; FALLBACK29-NEXT: movl 48(%esp,%eax), %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 44(%esp,%eax), %esi
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shrdl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 52(%esp,%eax), %edi
+; FALLBACK29-NEXT: movl 60(%esp,%eax), %ebx
+; FALLBACK29-NEXT: movl 40(%esp,%eax), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%eax), %eax
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, %ebp
+; FALLBACK29-NEXT: movl %edi, %edx
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK29-NEXT: shrdl %cl, %ebx, %eax
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edi
+; FALLBACK29-NEXT: movl %esi, 4(%edi)
+; FALLBACK29-NEXT: movl %eax, 24(%edi)
+; FALLBACK29-NEXT: sarl %cl, %ebx
+; FALLBACK29-NEXT: movl %ebx, 28(%edi)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%edi)
+; FALLBACK29-NEXT: movl %edx, 20(%edi)
+; FALLBACK29-NEXT: movl %ebp, 8(%edi)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%edi)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, (%edi)
+; FALLBACK29-NEXT: addl $108, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: ashr_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: movl 24(%ecx), %edx
+; FALLBACK30-NEXT: movl 20(%ecx), %esi
+; FALLBACK30-NEXT: movl 28(%ecx), %edi
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movl 16(%ecx), %ebx
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: sarl $31, %edi
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %eax, %edx
+; FALLBACK30-NEXT: movl %eax, %ebx
+; FALLBACK30-NEXT: notb %dl
+; FALLBACK30-NEXT: movl 36(%esp,%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %eax
+; FALLBACK30-NEXT: orl %ecx, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%esp,%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %eax
+; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ebx, %esi
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebx
+; FALLBACK30-NEXT: orl %ebx, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %eax
+; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %esi, %ecx, %ebx
+; FALLBACK30-NEXT: movl %esi, %ecx
+; FALLBACK30-NEXT: orl %ebx, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 56(%esp,%edi), %ebx
+; FALLBACK30-NEXT: leal (%ebx,%ebx), %ebp
+; FALLBACK30-NEXT: shlxl %edx, %ebp, %ebp
+; FALLBACK30-NEXT: movl 52(%esp,%edi), %eax
+; FALLBACK30-NEXT: shrxl %esi, %eax, %esi
+; FALLBACK30-NEXT: orl %esi, %ebp
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %eax, %eax
+; FALLBACK30-NEXT: shlxl %edx, %eax, %esi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrxl %ecx, %ebx, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT: sarxl %ecx, %edi, %ebx
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl %ebx, 28(%eax)
+; FALLBACK30-NEXT: movl %ecx, 4(%eax)
+; FALLBACK30-NEXT: movl %edi, 24(%eax)
+; FALLBACK30-NEXT: movl %esi, 16(%eax)
+; FALLBACK30-NEXT: movl %ebp, 20(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 8(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 12(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, (%eax)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: ashr_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $108, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: movl 24(%edx), %eax
+; FALLBACK31-NEXT: movl 16(%edx), %esi
+; FALLBACK31-NEXT: movl 28(%edx), %edi
+; FALLBACK31-NEXT: vmovups (%edx), %xmm0
+; FALLBACK31-NEXT: movl 20(%edx), %ebx
+; FALLBACK31-NEXT: movzbl (%ecx), %edx
+; FALLBACK31-NEXT: movl %edx, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: sarl $31, %edi
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %dl
+; FALLBACK31-NEXT: movzbl %dl, %eax
+; FALLBACK31-NEXT: movl 36(%esp,%eax), %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 32(%esp,%eax), %edi
+; FALLBACK31-NEXT: movl 48(%esp,%eax), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 44(%esp,%eax), %edi
+; FALLBACK31-NEXT: movl %edi, %ebx
+; FALLBACK31-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 52(%esp,%eax), %esi
+; FALLBACK31-NEXT: movl 60(%esp,%eax), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%esp,%eax), %ebx
+; FALLBACK31-NEXT: movl 56(%esp,%eax), %ebp
+; FALLBACK31-NEXT: movl %ebx, %eax
+; FALLBACK31-NEXT: shrdl %cl, %edi, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl %esi, %eax
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT: sarxl %ecx, %eax, %edi
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %ebx, %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl %eax, 4(%ecx)
+; FALLBACK31-NEXT: movl %ebp, 24(%ecx)
+; FALLBACK31-NEXT: movl %edi, 28(%ecx)
+; FALLBACK31-NEXT: movl %edx, 16(%ecx)
+; FALLBACK31-NEXT: movl %esi, 20(%ecx)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 8(%ecx)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 12(%ecx)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, (%ecx)
+; FALLBACK31-NEXT: addl $108, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
%bitOff = shl i256 %byteOff, 3
@@ -1655,343 +8734,3643 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
}
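
For readers skimming the regenerated checks above: the pattern they exercise is the shift-through-stack expansion. The 32-byte value is spilled to a double-width stack slot, the upper half is filled with the sign (the repeated stores after `sarl $31`), the shift amount is reduced to a register-aligned byte offset (`andb $28`), and the result is reloaded from `%esp` plus that offset, with only the sub-word residue handled by register shifts. Below is a minimal C++ sketch of the byte-multiple case only; it assumes a little-endian layout and masks the offset to stay inside the slot, and the function and buffer names are illustrative, not part of the patch.

#include <cstdint>
#include <cstring>

// Sketch: arithmetic right shift of a 32-byte little-endian value by a
// whole number of bytes, modeled with the double-width stack-slot trick.
void ashr_32bytes_bytewise(const uint8_t *src, unsigned byteOff,
                           uint8_t *dst) {
  uint8_t slot[64];
  std::memcpy(slot, src, 32);                     // low half: the value itself
  uint8_t fill = (src[31] & 0x80) ? 0xFF : 0x00;  // sign of the top byte
  std::memset(slot + 32, fill, 32);               // high half: sign extension
  byteOff &= 31;                                  // assumption: keep the load in the slot
  std::memcpy(dst, slot + byteOff, 32);           // reload at the byte offset
}

When the shift amount is not a whole number of shift units, the generated code additionally stitches adjacent reloaded words together (the `shrdl`/`shrxl`+`shlxl`+`orl` sequences above) to cover the residual bit count.
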
define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_64bytes:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pushq %rbx
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %r9
-; X64-SSE2-NEXT: movq 32(%rdi), %r10
-; X64-SSE2-NEXT: movq 40(%rdi), %r11
-; X64-SSE2-NEXT: movq 48(%rdi), %rbx
-; X64-SSE2-NEXT: movq 56(%rdi), %rdi
-; X64-SSE2-NEXT: movl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
-; X64-SSE2-NEXT: movq %r11, 56(%rdx)
-; X64-SSE2-NEXT: movq %r10, 32(%rdx)
-; X64-SSE2-NEXT: movq %r9, 40(%rdx)
-; X64-SSE2-NEXT: movq %r8, 16(%rdx)
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rax, (%rdx)
-; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT: popq %rbx
-; X64-SSE2-NEXT: retq
-;
-; X64-SSE42-LABEL: lshr_64bytes:
-; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: movups (%rdi), %xmm0
-; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
-; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
-; X64-SSE42-NEXT: movl (%rsi), %eax
-; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %eax
-; X64-SSE42-NEXT: movups -128(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rax), %xmm3
-; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
-; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: retq
-;
-; X64-AVX1-LABEL: lshr_64bytes:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: movl (%rsi), %eax
-; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: andl $63, %eax
-; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3
-; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX512-LABEL: lshr_64bytes:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT: movl (%rsi), %eax
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $63, %eax
-; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3
-; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: vzeroupper
-; X64-AVX512-NEXT: retq
-;
-; X86-SSE2-LABEL: lshr_64bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 40(%eax), %ebp
-; X86-SSE2-NEXT: movl 44(%eax), %ebx
-; X86-SSE2-NEXT: movl 48(%eax), %edi
-; X86-SSE2-NEXT: movl 52(%eax), %esi
-; X86-SSE2-NEXT: movl 56(%eax), %edx
-; X86-SSE2-NEXT: movl 60(%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %eax
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl %ecx, 56(%eax)
-; X86-SSE2-NEXT: movl %edx, 60(%eax)
-; X86-SSE2-NEXT: movl %esi, 48(%eax)
-; X86-SSE2-NEXT: movl %edi, 52(%eax)
-; X86-SSE2-NEXT: movl %ebx, 40(%eax)
-; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 32(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 36(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 24(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 28(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 16(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 20(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 8(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 12(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: lshr_64bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $128, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movups 16(%edx), %xmm1
-; X86-SSE42-NEXT: movups 32(%edx), %xmm2
-; X86-SSE42-NEXT: movups 48(%edx), %xmm3
-; X86-SSE42-NEXT: movl (%ecx), %ecx
-; X86-SSE42-NEXT: xorps %xmm4, %xmm4
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
-; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $128, %esp
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX1-LABEL: lshr_64bytes:
-; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: subl $128, %esp
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: vmovups (%edx), %ymm0
-; X86-AVX1-NEXT: vmovups 32(%edx), %ymm1
-; X86-AVX1-NEXT: movl (%ecx), %ecx
-; X86-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX1-NEXT: andl $63, %ecx
-; X86-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3
-; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT: addl $128, %esp
-; X86-AVX1-NEXT: vzeroupper
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX512-LABEL: lshr_64bytes:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $128, %esp
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vmovups (%edx), %zmm0
-; X86-AVX512-NEXT: movl (%ecx), %ecx
-; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT: andl $63, %ecx
-; X86-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3
-; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $128, %esp
-; X86-AVX512-NEXT: vzeroupper
-; X86-AVX512-NEXT: retl
+; FALLBACK0-LABEL: lshr_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq 16(%rdi), %rax
+; FALLBACK0-NEXT: movq 32(%rdi), %rcx
+; FALLBACK0-NEXT: movq 48(%rdi), %r8
+; FALLBACK0-NEXT: movq (%rdi), %r9
+; FALLBACK0-NEXT: movq 8(%rdi), %r10
+; FALLBACK0-NEXT: movq 24(%rdi), %r11
+; FALLBACK0-NEXT: movq 40(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %r14
+; FALLBACK0-NEXT: movl (%rsi), %edi
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rdi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %edi
+; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r9
+; FALLBACK0-NEXT: movq %r8, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: leaq (%r14,%r14), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %r11, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r8, %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %rbx, %r8
+; FALLBACK0-NEXT: movq %r9, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: orq %r15, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: addq %r9, %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r14, %r9
+; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: movq %r14, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r12
+; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r14, %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: orq %rbx, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT: leaq (%rdi,%rdi), %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: orq %r13, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 48(%rdx)
+; FALLBACK0-NEXT: movq %r14, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %r9, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r10, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: lshr_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: pushq %rax
+; FALLBACK1-NEXT: movq 24(%rdi), %rcx
+; FALLBACK1-NEXT: movq 40(%rdi), %r8
+; FALLBACK1-NEXT: movq 56(%rdi), %r9
+; FALLBACK1-NEXT: movq (%rdi), %r10
+; FALLBACK1-NEXT: movq 8(%rdi), %r11
+; FALLBACK1-NEXT: movq 16(%rdi), %rbx
+; FALLBACK1-NEXT: movq 32(%rdi), %r14
+; FALLBACK1-NEXT: movq 48(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %eax
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rax,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %eax
+; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq %r9, %r8
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq %r10, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r9, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r10, %rdi
+; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq -88(%rsp,%rax), %rbx
+; FALLBACK1-NEXT: movq %rbx, %r14
+; FALLBACK1-NEXT: shrdq %cl, %r10, %r14
+; FALLBACK1-NEXT: shrdq %cl, %rbx, %r9
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %r10
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %r10, 48(%rdx)
+; FALLBACK1-NEXT: movq %rax, 56(%rdx)
+; FALLBACK1-NEXT: movq %r9, 32(%rdx)
+; FALLBACK1-NEXT: movq %r14, 40(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %r11, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r8, 8(%rdx)
+; FALLBACK1-NEXT: addq $8, %rsp
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: lshr_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq 16(%rdi), %rcx
+; FALLBACK2-NEXT: movq 32(%rdi), %r8
+; FALLBACK2-NEXT: movq 48(%rdi), %r9
+; FALLBACK2-NEXT: movq (%rdi), %r10
+; FALLBACK2-NEXT: movq 8(%rdi), %r11
+; FALLBACK2-NEXT: movq 24(%rdi), %rbx
+; FALLBACK2-NEXT: movq 40(%rdi), %r14
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rax,8), %ecx
+; FALLBACK2-NEXT: andl $56, %ecx
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8
+; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT: shrxq %rcx, %r8, %rbx
+; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbp
+; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r9
+; FALLBACK2-NEXT: shrxq %rcx, %r10, %r11
+; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r13
+; FALLBACK2-NEXT: movl %ecx, %r12d
+; FALLBACK2-NEXT: notb %r12b
+; FALLBACK2-NEXT: addq %r10, %r10
+; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT: orq %rbx, %r10
+; FALLBACK2-NEXT: addq %r8, %r8
+; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT: orq %rbp, %r8
+; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT: shrxq %rcx, %rbx, %rbp
+; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r9, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT: orq %r15, %r9
+; FALLBACK2-NEXT: addq %r14, %r14
+; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT: orq %r13, %r11
+; FALLBACK2-NEXT: addq %rax, %rax
+; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT: orq %rbp, %rax
+; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT: movq %rax, 48(%rdx)
+; FALLBACK2-NEXT: movq %r11, 32(%rdx)
+; FALLBACK2-NEXT: movq %r9, 40(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT: movq %r8, (%rdx)
+; FALLBACK2-NEXT: movq %r10, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: pushq %rax
+; FALLBACK3-NEXT: movq 24(%rdi), %rcx
+; FALLBACK3-NEXT: movq 40(%rdi), %r8
+; FALLBACK3-NEXT: movq 56(%rdi), %r9
+; FALLBACK3-NEXT: movq (%rdi), %r10
+; FALLBACK3-NEXT: movq 8(%rdi), %r11
+; FALLBACK3-NEXT: movq 16(%rdi), %rbx
+; FALLBACK3-NEXT: movq 32(%rdi), %r14
+; FALLBACK3-NEXT: movq 48(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %eax
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rax,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %eax
+; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq %r9, %r8
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq %r10, %r11
+; FALLBACK3-NEXT: shrdq %cl, %r9, %r11
+; FALLBACK3-NEXT: shrdq %cl, %r10, %rdi
+; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq -88(%rsp,%rax), %rbx
+; FALLBACK3-NEXT: movq %rbx, %r14
+; FALLBACK3-NEXT: shrdq %cl, %r10, %r14
+; FALLBACK3-NEXT: shrdq %cl, %rbx, %r9
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %r10
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %r10, 48(%rdx)
+; FALLBACK3-NEXT: movq %r9, 32(%rdx)
+; FALLBACK3-NEXT: movq %r14, 40(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %r11, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r8, 8(%rdx)
+; FALLBACK3-NEXT: movq %rax, 56(%rdx)
+; FALLBACK3-NEXT: addq $8, %rsp
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbp
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: pushq %rax
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT: movl (%rsi), %r8d
+; FALLBACK4-NEXT: xorps %xmm4, %xmm4
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%r8,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %r8d
+; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12
+; FALLBACK4-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r14
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r14, %r10
+; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14
+; FALLBACK4-NEXT: movq %r14, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r13
+; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp
+; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: orq %r13, %r15
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: addq %r14, %r14
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: orq %r12, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbp
+; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8
+; FALLBACK4-NEXT: leaq (%r8,%r8), %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: orq %rbp, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: addq %rbx, %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r9, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: movq %r8, 56(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT: movq %r12, 48(%rdx)
+; FALLBACK4-NEXT: movq %r14, 32(%rdx)
+; FALLBACK4-NEXT: movq %r15, 40(%rdx)
+; FALLBACK4-NEXT: movq %r10, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: addq $8, %rsp
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: popq %rbp
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: xorps %xmm4, %xmm4
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT: movq %r10, %r8
+; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT: movq %r11, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r15
+; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shrq %cl, %r11
+; FALLBACK5-NEXT: movq %r15, 8(%rdx)
+; FALLBACK5-NEXT: movq %r9, 48(%rdx)
+; FALLBACK5-NEXT: movq %r11, 56(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r14, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: pushq %rax
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: xorps %xmm4, %xmm4
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %esi
+; FALLBACK6-NEXT: andl $56, %esi
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT: movl %esi, %ebx
+; FALLBACK6-NEXT: notb %bl
+; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT: orq %r11, %r8
+; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT: orq %r12, %r11
+; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r9, %rdi
+; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT: orq %r14, %r9
+; FALLBACK6-NEXT: addq %r10, %r10
+; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT: orq %r15, %r10
+; FALLBACK6-NEXT: addq %rax, %rax
+; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT: orq %r13, %rax
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT: orq %rbp, %rcx
+; FALLBACK6-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %r10, 32(%rdx)
+; FALLBACK6-NEXT: movq %r9, 40(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r8, (%rdx)
+; FALLBACK6-NEXT: addq $8, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: xorps %xmm4, %xmm4
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT: movq %r10, %r8
+; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT: movq %r11, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r15
+; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT: movq %r15, 8(%rdx)
+; FALLBACK7-NEXT: movq %r9, 48(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r14, (%rdx)
+; FALLBACK7-NEXT: movq %r10, 56(%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: lshr_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbp
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: pushq %rax
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT: movl (%rsi), %r9d
+; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%r9,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %r9d
+; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12
+; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r14
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r14, %r10
+; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14
+; FALLBACK8-NEXT: movq %r14, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r13
+; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp
+; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: orq %r13, %r15
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: addq %r14, %r14
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: orq %r12, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbp
+; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: orq %rbp, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %rbx, %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r8, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 56(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT: movq %r12, 48(%rdx)
+; FALLBACK8-NEXT: movq %r14, 32(%rdx)
+; FALLBACK8-NEXT: movq %r15, 40(%rdx)
+; FALLBACK8-NEXT: movq %r10, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: addq $8, %rsp
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: popq %rbp
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT: movq %r10, %r8
+; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT: movq %r11, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r15
+; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shrq %cl, %r11
+; FALLBACK9-NEXT: movq %r15, 8(%rdx)
+; FALLBACK9-NEXT: movq %r9, 48(%rdx)
+; FALLBACK9-NEXT: movq %r11, 56(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r14, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: pushq %rax
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %esi
+; FALLBACK10-NEXT: andl $56, %esi
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT: movl %esi, %ebx
+; FALLBACK10-NEXT: notb %bl
+; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT: orq %r11, %r8
+; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT: orq %r12, %r11
+; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r9, %rdi
+; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT: orq %r14, %r9
+; FALLBACK10-NEXT: addq %r10, %r10
+; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT: orq %r15, %r10
+; FALLBACK10-NEXT: addq %rax, %rax
+; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT: orq %r13, %rax
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT: orq %rbp, %rcx
+; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %r10, 32(%rdx)
+; FALLBACK10-NEXT: movq %r9, 40(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r8, (%rdx)
+; FALLBACK10-NEXT: addq $8, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT: movq %r10, %r8
+; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT: movq %r11, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r15
+; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT: movq %r15, 8(%rdx)
+; FALLBACK11-NEXT: movq %r9, 48(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r14, (%rdx)
+; FALLBACK11-NEXT: movq %r10, 56(%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbp
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: pushq %rax
+; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT: movl (%rsi), %r9d
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%r9,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %r9d
+; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12
+; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r14
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r14, %r10
+; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14
+; FALLBACK12-NEXT: movq %r14, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r13
+; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp
+; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: orq %r13, %r15
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: addq %r14, %r14
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: orq %r12, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbp
+; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: orq %rbp, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %rbx, %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r8, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 56(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT: movq %r12, 48(%rdx)
+; FALLBACK12-NEXT: movq %r14, 32(%rdx)
+; FALLBACK12-NEXT: movq %r15, 40(%rdx)
+; FALLBACK12-NEXT: movq %r10, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: addq $8, %rsp
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: popq %rbp
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT: movl (%rsi), %edi
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rdi,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %edi
+; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi
+; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9
+; FALLBACK13-NEXT: movq %r9, %rax
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax
+; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10
+; FALLBACK13-NEXT: movq %r10, %r8
+; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9
+; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11
+; FALLBACK13-NEXT: movq %r11, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi
+; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11
+; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14
+; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r15
+; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shrq %cl, %r11
+; FALLBACK13-NEXT: movq %r15, 8(%rdx)
+; FALLBACK13-NEXT: movq %r9, 48(%rdx)
+; FALLBACK13-NEXT: movq %r11, 56(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rax, 24(%rdx)
+; FALLBACK13-NEXT: movq %r14, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: pushq %rax
+; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT: movl (%rsi), %esi
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK14-NEXT: andl $56, %ecx
+; FALLBACK14-NEXT: andl $56, %esi
+; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11
+; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax
+; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12
+; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9
+; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10
+; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14
+; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15
+; FALLBACK14-NEXT: movl %ecx, %ebx
+; FALLBACK14-NEXT: notb %bl
+; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp
+; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT: orq %r11, %r8
+; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT: orq %r12, %r11
+; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12
+; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r9, %rdi
+; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT: orq %r14, %r9
+; FALLBACK14-NEXT: addq %r10, %r10
+; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT: orq %r15, %r10
+; FALLBACK14-NEXT: addq %rsi, %rsi
+; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT: orq %r13, %rsi
+; FALLBACK14-NEXT: addq %rax, %rax
+; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT: orq %rbp, %rax
+; FALLBACK14-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 48(%rdx)
+; FALLBACK14-NEXT: movq %r10, 32(%rdx)
+; FALLBACK14-NEXT: movq %r9, 40(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r8, (%rdx)
+; FALLBACK14-NEXT: addq $8, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT: movq %r10, %r8
+; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT: movq %r11, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r15
+; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT: movq %r15, 8(%rdx)
+; FALLBACK15-NEXT: movq %r9, 48(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r14, (%rdx)
+; FALLBACK15-NEXT: movq %r10, 56(%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: lshr_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%eax), %ebp
+; FALLBACK16-NEXT: movl 44(%eax), %ebx
+; FALLBACK16-NEXT: movl 48(%eax), %edi
+; FALLBACK16-NEXT: movl 52(%eax), %esi
+; FALLBACK16-NEXT: movl 56(%eax), %edx
+; FALLBACK16-NEXT: movl 60(%eax), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %eax
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %eax, %esi
+; FALLBACK16-NEXT: andl $60, %esi
+; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT: shll $3, %eax
+; FALLBACK16-NEXT: andl $24, %eax
+; FALLBACK16-NEXT: movl %edx, %edi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT: movb %al, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %edi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK16-NEXT: movl %edx, %ebp
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %ebp, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: addl %eax, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %eax, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK16-NEXT: leal (%edx,%edx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %ebp, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %edx, %edi
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK16-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 60(%eax)
+; FALLBACK16-NEXT: movl %edx, 56(%eax)
+; FALLBACK16-NEXT: movl %esi, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl %edi, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%ecx), %ebp
+; FALLBACK17-NEXT: movl 44(%ecx), %ebx
+; FALLBACK17-NEXT: movl 48(%ecx), %edi
+; FALLBACK17-NEXT: movl 52(%ecx), %esi
+; FALLBACK17-NEXT: movl 56(%ecx), %edx
+; FALLBACK17-NEXT: movl 60(%ecx), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %edx
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: shrl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 60(%ebp)
+; FALLBACK17-NEXT: movl %esi, 48(%ebp)
+; FALLBACK17-NEXT: movl %edi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebp
+; FALLBACK18-NEXT: movl 44(%eax), %ebx
+; FALLBACK18-NEXT: movl 48(%eax), %edi
+; FALLBACK18-NEXT: movl 52(%eax), %esi
+; FALLBACK18-NEXT: movl 56(%eax), %edx
+; FALLBACK18-NEXT: movl 60(%eax), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %eax
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
+; FALLBACK18-NEXT: leal (,%eax,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ecx
+; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl %ecx, %edi
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %eax, %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %edx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebx, 56(%eax)
+; FALLBACK18-NEXT: movl %edi, 48(%eax)
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl %esi, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $188, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%ecx), %ebp
+; FALLBACK19-NEXT: movl 44(%ecx), %ebx
+; FALLBACK19-NEXT: movl 48(%ecx), %edi
+; FALLBACK19-NEXT: movl 52(%ecx), %esi
+; FALLBACK19-NEXT: movl 56(%ecx), %edx
+; FALLBACK19-NEXT: movl 60(%ecx), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %ecx
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, %ebp
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shll $3, %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl %edi, %edx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 56(%ebp)
+; FALLBACK19-NEXT: movl %esi, 48(%ebp)
+; FALLBACK19-NEXT: movl %edx, 52(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 44(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 32(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 36(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 16(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT: movl %eax, 60(%ebp)
+; FALLBACK19-NEXT: addl $188, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: xorps %xmm4, %xmm4
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %esi
+; FALLBACK20-NEXT: andl $60, %esi
+; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %edi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %eax, %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT: leal (%edx,%edx), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: addl %edi, %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %eax, %edx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 60(%eax)
+; FALLBACK20-NEXT: movl %edx, 56(%eax)
+; FALLBACK20-NEXT: movl %esi, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl %edi, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT: movl (%eax), %ecx
+; FALLBACK21-NEXT: xorps %xmm4, %xmm4
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %esi
+; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %edx
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: shrl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 60(%ebp)
+; FALLBACK21-NEXT: movl %esi, 48(%ebp)
+; FALLBACK21-NEXT: movl %edi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT: movl (%eax), %ecx
+; FALLBACK22-NEXT: xorps %xmm4, %xmm4
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: leal (,%ecx,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %edi, %ebp
+; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edi, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax
+; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %eax, %eax
+; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx
+; FALLBACK22-NEXT: addl %ebp, %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT: orl %eax, %ebx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl %edx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebx, 56(%eax)
+; FALLBACK22-NEXT: movl %edi, 48(%eax)
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl %esi, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, (%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $188, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT: movl (%eax), %ecx
+; FALLBACK23-NEXT: xorps %xmm4, %xmm4
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ecx, %ebp
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shll $3, %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %esi
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl %edi, %edx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %eax, 56(%ebp)
+; FALLBACK23-NEXT: movl %esi, 48(%ebp)
+; FALLBACK23-NEXT: movl %edx, 52(%ebp)
+; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 44(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 32(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 36(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 24(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 28(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 16(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 20(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 8(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 12(%ebp)
+; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, (%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT: movl %eax, 60(%ebp)
+; FALLBACK23-NEXT: addl $188, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT: movl (%eax), %ecx
+; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, %esi
+; FALLBACK24-NEXT: andl $60, %esi
+; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT: shll $3, %ecx
+; FALLBACK24-NEXT: andl $24, %ecx
+; FALLBACK24-NEXT: movl %edx, %edi
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK24-NEXT: movl %ecx, %ebp
+; FALLBACK24-NEXT: movb %cl, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %edi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %eax, %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT: leal (%edx,%edx), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: addl %edi, %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %eax, %edx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 60(%eax)
+; FALLBACK24-NEXT: movl %edx, 56(%eax)
+; FALLBACK24-NEXT: movl %esi, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl %edi, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT: movl (%eax), %ecx
+; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %esi
+; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %edx
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: shrl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 60(%ebp)
+; FALLBACK25-NEXT: movl %esi, 48(%ebp)
+; FALLBACK25-NEXT: movl %edi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT: movl (%eax), %ecx
+; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: leal (,%ecx,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %edi, %ebp
+; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi
+; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax
+; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %eax, %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx
+; FALLBACK26-NEXT: orl %eax, %ebx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: movl %edx, 60(%ecx)
+; FALLBACK26-NEXT: movl %ebx, 56(%ecx)
+; FALLBACK26-NEXT: movl %edi, 48(%ecx)
+; FALLBACK26-NEXT: movl %esi, 52(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 40(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 44(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 32(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 36(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 24(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 28(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 16(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 4(%ecx)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $188, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT: movl (%eax), %ecx
+; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ecx, %ebp
+; FALLBACK27-NEXT: andl $60, %ebp
+; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shll $3, %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %esi
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl %edi, %edx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %eax, 56(%ebp)
+; FALLBACK27-NEXT: movl %esi, 48(%ebp)
+; FALLBACK27-NEXT: movl %edx, 52(%ebp)
+; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 44(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 32(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 36(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 24(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 28(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 16(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 20(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 8(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 12(%ebp)
+; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, (%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT: movl %eax, 60(%ebp)
+; FALLBACK27-NEXT: addl $188, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT: movl (%eax), %ecx
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, %esi
+; FALLBACK28-NEXT: andl $60, %esi
+; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT: shll $3, %ecx
+; FALLBACK28-NEXT: andl $24, %ecx
+; FALLBACK28-NEXT: movl %edx, %edi
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK28-NEXT: movl %ecx, %ebp
+; FALLBACK28-NEXT: movb %cl, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %edi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %eax, %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT: leal (%edx,%edx), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: addl %edi, %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %eax, %edx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 60(%eax)
+; FALLBACK28-NEXT: movl %edx, 56(%eax)
+; FALLBACK28-NEXT: movl %esi, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl %edi, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT: movl (%eax), %ecx
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %esi
+; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %edx
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: shrl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 60(%ebp)
+; FALLBACK29-NEXT: movl %esi, 48(%ebp)
+; FALLBACK29-NEXT: movl %edi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT: movl (%eax), %edx
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: leal (,%edx,8), %ecx
+; FALLBACK30-NEXT: andl $24, %ecx
+; FALLBACK30-NEXT: andl $60, %edx
+; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi
+; FALLBACK30-NEXT: movl %ecx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %edi, %ebp
+; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi
+; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax
+; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %eax, %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax
+; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx
+; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp
+; FALLBACK30-NEXT: leal (%edx,%edx), %ecx
+; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx
+; FALLBACK30-NEXT: orl %eax, %edx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: movl %ebp, 60(%ecx)
+; FALLBACK30-NEXT: movl %edx, 56(%ecx)
+; FALLBACK30-NEXT: movl %edi, 48(%ecx)
+; FALLBACK30-NEXT: movl %esi, 52(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 40(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 44(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 32(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 36(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 24(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 28(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 16(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 4(%ecx)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $188, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT: movl (%eax), %ecx
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ecx, %ebp
+; FALLBACK31-NEXT: andl $60, %ebp
+; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shll $3, %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %esi
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl %edi, %edx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %eax, 56(%ebp)
+; FALLBACK31-NEXT: movl %esi, 48(%ebp)
+; FALLBACK31-NEXT: movl %edx, 52(%ebp)
+; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 44(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 32(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 36(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 24(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 28(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 16(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 20(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 8(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 12(%ebp)
+; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, (%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT: movl %eax, 60(%ebp)
+; FALLBACK31-NEXT: addl $188, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -2000,363 +12379,3775 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
ret void
}
define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_64bytes:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pushq %rbx
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %r9
-; X64-SSE2-NEXT: movq 32(%rdi), %r10
-; X64-SSE2-NEXT: movq 40(%rdi), %r11
-; X64-SSE2-NEXT: movq 48(%rdi), %rbx
-; X64-SSE2-NEXT: movq 56(%rdi), %rdi
-; X64-SSE2-NEXT: movl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
-; X64-SSE2-NEXT: negl %esi
-; X64-SSE2-NEXT: movslq %esi, %rax
-; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rax), %r8
-; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r9
-; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r10
-; X64-SSE2-NEXT: movq -8(%rsp,%rax), %r11
-; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax
-; X64-SSE2-NEXT: movq %rax, 48(%rdx)
-; X64-SSE2-NEXT: movq %r11, 56(%rdx)
-; X64-SSE2-NEXT: movq %r10, 32(%rdx)
-; X64-SSE2-NEXT: movq %r9, 40(%rdx)
-; X64-SSE2-NEXT: movq %r8, 16(%rdx)
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rcx, (%rdx)
-; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
-; X64-SSE2-NEXT: popq %rbx
-; X64-SSE2-NEXT: retq
-;
-; X64-SSE42-LABEL: shl_64bytes:
-; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: movups (%rdi), %xmm0
-; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
-; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
-; X64-SSE42-NEXT: movl (%rsi), %eax
-; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %eax
-; X64-SSE42-NEXT: negl %eax
-; X64-SSE42-NEXT: cltq
-; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm3
-; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
-; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: retq
-;
-; X64-AVX1-LABEL: shl_64bytes:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: movl (%rsi), %eax
-; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: andl $63, %eax
-; X64-AVX1-NEXT: negl %eax
-; X64-AVX1-NEXT: cltq
-; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3
-; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX512-LABEL: shl_64bytes:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT: movl (%rsi), %eax
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $63, %eax
-; X64-AVX512-NEXT: negl %eax
-; X64-AVX512-NEXT: cltq
-; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3
-; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: vzeroupper
-; X64-AVX512-NEXT: retq
-;
-; X86-SSE2-LABEL: shl_64bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 40(%eax), %ebp
-; X86-SSE2-NEXT: movl 44(%eax), %ebx
-; X86-SSE2-NEXT: movl 48(%eax), %edi
-; X86-SSE2-NEXT: movl 52(%eax), %esi
-; X86-SSE2-NEXT: movl 56(%eax), %edx
-; X86-SSE2-NEXT: movl 60(%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %eax
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: subl %eax, %ecx
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%ecx), %ebp
-; X86-SSE2-NEXT: movl 40(%ecx), %ebx
-; X86-SSE2-NEXT: movl 52(%ecx), %edi
-; X86-SSE2-NEXT: movl 60(%ecx), %esi
-; X86-SSE2-NEXT: movl 56(%ecx), %edx
-; X86-SSE2-NEXT: negl %eax
-; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl %edx, 56(%eax)
-; X86-SSE2-NEXT: movl %esi, 60(%eax)
-; X86-SSE2-NEXT: movl %ecx, 48(%eax)
-; X86-SSE2-NEXT: movl %edi, 52(%eax)
-; X86-SSE2-NEXT: movl %ebx, 40(%eax)
-; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 32(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 36(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 24(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 28(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 16(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 20(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 8(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 12(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: shl_64bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $128, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movups 16(%edx), %xmm1
-; X86-SSE42-NEXT: movups 32(%edx), %xmm2
-; X86-SSE42-NEXT: movups 48(%edx), %xmm3
-; X86-SSE42-NEXT: movl (%ecx), %ecx
-; X86-SSE42-NEXT: xorps %xmm4, %xmm4
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, (%esp)
-; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
-; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: subl %ecx, %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movups 16(%edx), %xmm1
-; X86-SSE42-NEXT: movups 32(%edx), %xmm2
-; X86-SSE42-NEXT: negl %ecx
-; X86-SSE42-NEXT: movups 112(%esp,%ecx), %xmm3
-; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $128, %esp
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX1-LABEL: shl_64bytes:
-; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: subl $128, %esp
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: vmovups (%edx), %ymm0
-; X86-AVX1-NEXT: vmovups 32(%edx), %ymm1
-; X86-AVX1-NEXT: movl (%ecx), %ecx
-; X86-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovups %ymm2, (%esp)
-; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: andl $63, %ecx
-; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: subl %ecx, %edx
-; X86-AVX1-NEXT: vmovups (%edx), %xmm0
-; X86-AVX1-NEXT: vmovups 16(%edx), %xmm1
-; X86-AVX1-NEXT: vmovups 32(%edx), %xmm2
-; X86-AVX1-NEXT: negl %ecx
-; X86-AVX1-NEXT: vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT: addl $128, %esp
-; X86-AVX1-NEXT: vzeroupper
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX512-LABEL: shl_64bytes:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $128, %esp
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vmovups (%edx), %zmm0
-; X86-AVX512-NEXT: movl (%ecx), %ecx
-; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT: vmovups %zmm1, (%esp)
-; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: andl $63, %ecx
-; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: subl %ecx, %edx
-; X86-AVX512-NEXT: vmovups (%edx), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%edx), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%edx), %xmm2
-; X86-AVX512-NEXT: negl %ecx
-; X86-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $128, %esp
-; X86-AVX512-NEXT: vzeroupper
-; X86-AVX512-NEXT: retl
+; FALLBACK0-LABEL: shl_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq 16(%rdi), %rax
+; FALLBACK0-NEXT: movq 32(%rdi), %rcx
+; FALLBACK0-NEXT: movq 48(%rdi), %r8
+; FALLBACK0-NEXT: movq (%rdi), %r9
+; FALLBACK0-NEXT: movq 8(%rdi), %r10
+; FALLBACK0-NEXT: movq 24(%rdi), %r11
+; FALLBACK0-NEXT: movq 40(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %rdi
+; FALLBACK0-NEXT: movl (%rsi), %esi
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %esi
+; FALLBACK0-NEXT: negl %esi
+; FALLBACK0-NEXT: movslq %esi, %rbx
+; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi
+; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r8
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r10
+; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r14
+; FALLBACK0-NEXT: movq %r10, %r9
+; FALLBACK0-NEXT: shrq %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movq %r8, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: movq %r14, %r11
+; FALLBACK0-NEXT: shrq %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: orq %r15, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %r14, %rdi
+; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14
+; FALLBACK0-NEXT: movq %r14, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r12
+; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13
+; FALLBACK0-NEXT: movq %r13, %r15
+; FALLBACK0-NEXT: shrq %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r13
+; FALLBACK0-NEXT: shrq %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r8
+; FALLBACK0-NEXT: orq %r13, %r8
+; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r12
+; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx
+; FALLBACK0-NEXT: movq %rbx, %r13
+; FALLBACK0-NEXT: shrq %r13
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: orq %r12, %r13
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: shrq %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: orq %rbx, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: movq %r10, (%rdx)
+; FALLBACK0-NEXT: movq %r14, 48(%rdx)
+; FALLBACK0-NEXT: movq %r13, 56(%rdx)
+; FALLBACK0-NEXT: movq %r8, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: pushq %rax
+; FALLBACK1-NEXT: movq 24(%rdi), %rax
+; FALLBACK1-NEXT: movq 40(%rdi), %rcx
+; FALLBACK1-NEXT: movq 56(%rdi), %r8
+; FALLBACK1-NEXT: movq (%rdi), %r9
+; FALLBACK1-NEXT: movq 8(%rdi), %r10
+; FALLBACK1-NEXT: movq 16(%rdi), %r11
+; FALLBACK1-NEXT: movq 32(%rdi), %rbx
+; FALLBACK1-NEXT: movq 48(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %esi
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %esi
+; FALLBACK1-NEXT: negl %esi
+; FALLBACK1-NEXT: movslq %esi, %r9
+; FALLBACK1-NEXT: movq -64(%rsp,%r9), %rax
+; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rsi
+; FALLBACK1-NEXT: movq -56(%rsp,%r9), %r10
+; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r11
+; FALLBACK1-NEXT: movq %r10, %rdi
+; FALLBACK1-NEXT: shldq %cl, %rax, %rdi
+; FALLBACK1-NEXT: movq %r11, %r8
+; FALLBACK1-NEXT: shldq %cl, %rsi, %r8
+; FALLBACK1-NEXT: shldq %cl, %r10, %rsi
+; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r10
+; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx
+; FALLBACK1-NEXT: movq %rbx, %r14
+; FALLBACK1-NEXT: shldq %cl, %r10, %r14
+; FALLBACK1-NEXT: shldq %cl, %r11, %r10
+; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r11
+; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK1-NEXT: shldq %cl, %r11, %r9
+; FALLBACK1-NEXT: shldq %cl, %rbx, %r11
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shlq %cl, %rax
+; FALLBACK1-NEXT: movq %r11, 48(%rdx)
+; FALLBACK1-NEXT: movq %r9, 56(%rdx)
+; FALLBACK1-NEXT: movq %r10, 32(%rdx)
+; FALLBACK1-NEXT: movq %r14, 40(%rdx)
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %r8, 24(%rdx)
+; FALLBACK1-NEXT: movq %rax, (%rdx)
+; FALLBACK1-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK1-NEXT: addq $8, %rsp
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq 16(%rdi), %rax
+; FALLBACK2-NEXT: movq 32(%rdi), %rcx
+; FALLBACK2-NEXT: movq 48(%rdi), %r8
+; FALLBACK2-NEXT: movq (%rdi), %r9
+; FALLBACK2-NEXT: movq 8(%rdi), %r10
+; FALLBACK2-NEXT: movq 24(%rdi), %r11
+; FALLBACK2-NEXT: movq 40(%rdi), %rbx
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %esi
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: andl $56, %esi
+; FALLBACK2-NEXT: negl %esi
+; FALLBACK2-NEXT: movslq %esi, %rdi
+; FALLBACK2-NEXT: movq -56(%rsp,%rdi), %rsi
+; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %r9
+; FALLBACK2-NEXT: movq -64(%rsp,%rdi), %r14
+; FALLBACK2-NEXT: movq -48(%rsp,%rdi), %r10
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rbx
+; FALLBACK2-NEXT: shlxq %rax, %r10, %r11
+; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %r8
+; FALLBACK2-NEXT: shlxq %rax, %r8, %r15
+; FALLBACK2-NEXT: shlxq %rax, %r14, %r12
+; FALLBACK2-NEXT: movl %eax, %r13d
+; FALLBACK2-NEXT: notb %r13b
+; FALLBACK2-NEXT: shrq %r14
+; FALLBACK2-NEXT: shrxq %r13, %r14, %r14
+; FALLBACK2-NEXT: orq %r9, %r14
+; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %r9
+; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp
+; FALLBACK2-NEXT: shrq %r10
+; FALLBACK2-NEXT: shrxq %r13, %r10, %r10
+; FALLBACK2-NEXT: orq %rbx, %r10
+; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rdi), %rbx
+; FALLBACK2-NEXT: movq -16(%rsp,%rdi), %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rax
+; FALLBACK2-NEXT: shrq %rsi
+; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: shrq %r9
+; FALLBACK2-NEXT: shrxq %r13, %r9, %r9
+; FALLBACK2-NEXT: orq %r15, %r9
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx
+; FALLBACK2-NEXT: orq %rbp, %rcx
+; FALLBACK2-NEXT: shrq %rdi
+; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi
+; FALLBACK2-NEXT: orq %rbx, %rdi
+; FALLBACK2-NEXT: shrq %r8
+; FALLBACK2-NEXT: shrxq %r13, %r8, %r8
+; FALLBACK2-NEXT: orq %rax, %r8
+; FALLBACK2-NEXT: movq %r12, (%rdx)
+; FALLBACK2-NEXT: movq %r8, 48(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK2-NEXT: movq %rcx, 32(%rdx)
+; FALLBACK2-NEXT: movq %r9, 40(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT: movq %r10, 24(%rdx)
+; FALLBACK2-NEXT: movq %r14, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: pushq %rax
+; FALLBACK3-NEXT: movq 24(%rdi), %rax
+; FALLBACK3-NEXT: movq 40(%rdi), %rcx
+; FALLBACK3-NEXT: movq 56(%rdi), %r8
+; FALLBACK3-NEXT: movq (%rdi), %r9
+; FALLBACK3-NEXT: movq 8(%rdi), %r10
+; FALLBACK3-NEXT: movq 16(%rdi), %r11
+; FALLBACK3-NEXT: movq 32(%rdi), %rbx
+; FALLBACK3-NEXT: movq 48(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %esi
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %esi
+; FALLBACK3-NEXT: negl %esi
+; FALLBACK3-NEXT: movslq %esi, %r8
+; FALLBACK3-NEXT: movq -64(%rsp,%r8), %rdi
+; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax
+; FALLBACK3-NEXT: movq -56(%rsp,%r8), %r9
+; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r10
+; FALLBACK3-NEXT: movq %r9, %rsi
+; FALLBACK3-NEXT: shldq %cl, %rdi, %rsi
+; FALLBACK3-NEXT: movq %r10, %r11
+; FALLBACK3-NEXT: shldq %cl, %rax, %r11
+; FALLBACK3-NEXT: shldq %cl, %r9, %rax
+; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx
+; FALLBACK3-NEXT: movq %rbx, %r14
+; FALLBACK3-NEXT: shldq %cl, %r9, %r14
+; FALLBACK3-NEXT: shldq %cl, %r10, %r9
+; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r10
+; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK3-NEXT: shldq %cl, %r10, %r8
+; FALLBACK3-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK3-NEXT: shlxq %rcx, %rdi, %rcx
+; FALLBACK3-NEXT: movq %r10, 48(%rdx)
+; FALLBACK3-NEXT: movq %r8, 56(%rdx)
+; FALLBACK3-NEXT: movq %r9, 32(%rdx)
+; FALLBACK3-NEXT: movq %r14, 40(%rdx)
+; FALLBACK3-NEXT: movq %rax, 16(%rdx)
+; FALLBACK3-NEXT: movq %r11, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK3-NEXT: addq $8, %rsp
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT: movl (%rsi), %ecx
+; FALLBACK4-NEXT: xorps %xmm4, %xmm4
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %ecx
+; FALLBACK4-NEXT: negl %ecx
+; FALLBACK4-NEXT: movslq %ecx, %r9
+; FALLBACK4-NEXT: movq -32(%rsp,%r9), %rdi
+; FALLBACK4-NEXT: movq %rdi, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r8
+; FALLBACK4-NEXT: shrq %r8
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: orq %r10, %r8
+; FALLBACK4-NEXT: movq -24(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r11, %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK4-NEXT: movq %r15, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK4-NEXT: movq %r12, %rbx
+; FALLBACK4-NEXT: shrq %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: orq %r15, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: movq %r14, %r15
+; FALLBACK4-NEXT: shrq %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r15
+; FALLBACK4-NEXT: orq %r12, %r15
+; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK4-NEXT: movq %r12, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r13
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r13, %r10
+; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: shrq %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: orq %r9, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: movq %r14, (%rdx)
+; FALLBACK4-NEXT: movq %r12, 56(%rdx)
+; FALLBACK4-NEXT: movq %r10, 48(%rdx)
+; FALLBACK4-NEXT: movq %r15, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK4-NEXT: movq %r8, 32(%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: xorps %xmm4, %xmm4
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: negl %eax
+; FALLBACK5-NEXT: movslq %eax, %r8
+; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rax
+; FALLBACK5-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK5-NEXT: movq %r10, %rdi
+; FALLBACK5-NEXT: shldq %cl, %r9, %rdi
+; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r9
+; FALLBACK5-NEXT: shldq %cl, %r9, %rax
+; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK5-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK5-NEXT: movq %r14, %r15
+; FALLBACK5-NEXT: shldq %cl, %r10, %r15
+; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK5-NEXT: shldq %cl, %r14, %r8
+; FALLBACK5-NEXT: movq %r11, %r10
+; FALLBACK5-NEXT: shlq %cl, %r10
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK5-NEXT: movq %r8, 56(%rdx)
+; FALLBACK5-NEXT: movq %r15, 48(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK5-NEXT: movq %r9, 16(%rdx)
+; FALLBACK5-NEXT: movq %rax, 24(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK5-NEXT: movq %r10, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: pushq %rax
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT: movl (%rsi), %ecx
+; FALLBACK6-NEXT: xorps %xmm4, %xmm4
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rcx,8), %eax
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: andl $56, %ecx
+; FALLBACK6-NEXT: negl %ecx
+; FALLBACK6-NEXT: movslq %ecx, %rsi
+; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %r12
+; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %r13
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rbx
+; FALLBACK6-NEXT: shlxq %rax, %rbx, %r11
+; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r15
+; FALLBACK6-NEXT: shlxq %rax, %r15, %r14
+; FALLBACK6-NEXT: movl %eax, %r10d
+; FALLBACK6-NEXT: notb %r10b
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %r10, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r12, %rdi
+; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %r12
+; FALLBACK6-NEXT: shlxq %rax, %r12, %rbp
+; FALLBACK6-NEXT: shrq %r8
+; FALLBACK6-NEXT: shrxq %r10, %r8, %r8
+; FALLBACK6-NEXT: orq %r13, %r8
+; FALLBACK6-NEXT: shlxq %rax, -8(%rsp,%rsi), %r13
+; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT: shlxq %rax, %rsi, %rax
+; FALLBACK6-NEXT: shrq %rbx
+; FALLBACK6-NEXT: shrxq %r10, %rbx, %rbx
+; FALLBACK6-NEXT: orq %r9, %rbx
+; FALLBACK6-NEXT: shrq %r15
+; FALLBACK6-NEXT: shrxq %r10, %r15, %r9
+; FALLBACK6-NEXT: orq %r11, %r9
+; FALLBACK6-NEXT: shrq %rsi
+; FALLBACK6-NEXT: shrxq %r10, %rsi, %rsi
+; FALLBACK6-NEXT: orq %r14, %rsi
+; FALLBACK6-NEXT: shrq %rcx
+; FALLBACK6-NEXT: shrxq %r10, %rcx, %rcx
+; FALLBACK6-NEXT: orq %rbp, %rcx
+; FALLBACK6-NEXT: shrq %r12
+; FALLBACK6-NEXT: shrxq %r10, %r12, %r10
+; FALLBACK6-NEXT: orq %r13, %r10
+; FALLBACK6-NEXT: movq %rax, (%rdx)
+; FALLBACK6-NEXT: movq %r10, 56(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 48(%rdx)
+; FALLBACK6-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK6-NEXT: movq %r9, 16(%rdx)
+; FALLBACK6-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK6-NEXT: movq %r8, 40(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK6-NEXT: addq $8, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: xorps %xmm4, %xmm4
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: negl %eax
+; FALLBACK7-NEXT: movslq %eax, %r8
+; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rax
+; FALLBACK7-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK7-NEXT: movq %r10, %rdi
+; FALLBACK7-NEXT: shldq %cl, %r9, %rdi
+; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r9
+; FALLBACK7-NEXT: shldq %cl, %r9, %rax
+; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK7-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK7-NEXT: movq %r14, %r15
+; FALLBACK7-NEXT: shldq %cl, %r10, %r15
+; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK7-NEXT: shldq %cl, %r14, %r8
+; FALLBACK7-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK7-NEXT: movq %r8, 56(%rdx)
+; FALLBACK7-NEXT: movq %r15, 48(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK7-NEXT: movq %r9, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK7-NEXT: movq %r10, (%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT: movl (%rsi), %ecx
+; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %ecx
+; FALLBACK8-NEXT: negl %ecx
+; FALLBACK8-NEXT: movslq %ecx, %r9
+; FALLBACK8-NEXT: movq -32(%rsp,%r9), %rdi
+; FALLBACK8-NEXT: movq %rdi, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r8
+; FALLBACK8-NEXT: shrq %r8
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: orq %r10, %r8
+; FALLBACK8-NEXT: movq -24(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r11, %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK8-NEXT: movq %r15, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK8-NEXT: movq %r12, %rbx
+; FALLBACK8-NEXT: shrq %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: orq %r15, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: movq %r14, %r15
+; FALLBACK8-NEXT: shrq %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r15
+; FALLBACK8-NEXT: orq %r12, %r15
+; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK8-NEXT: movq %r12, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r13
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r13, %r10
+; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: shrq %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: orq %r9, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: movq %r14, (%rdx)
+; FALLBACK8-NEXT: movq %r12, 56(%rdx)
+; FALLBACK8-NEXT: movq %r10, 48(%rdx)
+; FALLBACK8-NEXT: movq %r15, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK8-NEXT: movq %r8, 32(%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: negl %eax
+; FALLBACK9-NEXT: movslq %eax, %r8
+; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rax
+; FALLBACK9-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK9-NEXT: movq %r10, %rdi
+; FALLBACK9-NEXT: shldq %cl, %r9, %rdi
+; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r9
+; FALLBACK9-NEXT: shldq %cl, %r9, %rax
+; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK9-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK9-NEXT: movq %r14, %r15
+; FALLBACK9-NEXT: shldq %cl, %r10, %r15
+; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK9-NEXT: shldq %cl, %r14, %r8
+; FALLBACK9-NEXT: movq %r11, %r10
+; FALLBACK9-NEXT: shlq %cl, %r10
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK9-NEXT: movq %r8, 56(%rdx)
+; FALLBACK9-NEXT: movq %r15, 48(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK9-NEXT: movq %r9, 16(%rdx)
+; FALLBACK9-NEXT: movq %rax, 24(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK9-NEXT: movq %r10, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: pushq %rax
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT: movl (%rsi), %ecx
+; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rcx,8), %eax
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: andl $56, %ecx
+; FALLBACK10-NEXT: negl %ecx
+; FALLBACK10-NEXT: movslq %ecx, %rsi
+; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %r12
+; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %r13
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rbx
+; FALLBACK10-NEXT: shlxq %rax, %rbx, %r11
+; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r15
+; FALLBACK10-NEXT: shlxq %rax, %r15, %r14
+; FALLBACK10-NEXT: movl %eax, %r10d
+; FALLBACK10-NEXT: notb %r10b
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %r10, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r12, %rdi
+; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %r12
+; FALLBACK10-NEXT: shlxq %rax, %r12, %rbp
+; FALLBACK10-NEXT: shrq %r8
+; FALLBACK10-NEXT: shrxq %r10, %r8, %r8
+; FALLBACK10-NEXT: orq %r13, %r8
+; FALLBACK10-NEXT: shlxq %rax, -8(%rsp,%rsi), %r13
+; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT: shlxq %rax, %rsi, %rax
+; FALLBACK10-NEXT: shrq %rbx
+; FALLBACK10-NEXT: shrxq %r10, %rbx, %rbx
+; FALLBACK10-NEXT: orq %r9, %rbx
+; FALLBACK10-NEXT: shrq %r15
+; FALLBACK10-NEXT: shrxq %r10, %r15, %r9
+; FALLBACK10-NEXT: orq %r11, %r9
+; FALLBACK10-NEXT: shrq %rsi
+; FALLBACK10-NEXT: shrxq %r10, %rsi, %rsi
+; FALLBACK10-NEXT: orq %r14, %rsi
+; FALLBACK10-NEXT: shrq %rcx
+; FALLBACK10-NEXT: shrxq %r10, %rcx, %rcx
+; FALLBACK10-NEXT: orq %rbp, %rcx
+; FALLBACK10-NEXT: shrq %r12
+; FALLBACK10-NEXT: shrxq %r10, %r12, %r10
+; FALLBACK10-NEXT: orq %r13, %r10
+; FALLBACK10-NEXT: movq %rax, (%rdx)
+; FALLBACK10-NEXT: movq %r10, 56(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 48(%rdx)
+; FALLBACK10-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK10-NEXT: movq %r9, 16(%rdx)
+; FALLBACK10-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK10-NEXT: movq %r8, 40(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK10-NEXT: addq $8, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: negl %eax
+; FALLBACK11-NEXT: movslq %eax, %r8
+; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rax
+; FALLBACK11-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK11-NEXT: movq %r10, %rdi
+; FALLBACK11-NEXT: shldq %cl, %r9, %rdi
+; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r9
+; FALLBACK11-NEXT: shldq %cl, %r9, %rax
+; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK11-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK11-NEXT: movq %r14, %r15
+; FALLBACK11-NEXT: shldq %cl, %r10, %r15
+; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK11-NEXT: shldq %cl, %r14, %r8
+; FALLBACK11-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK11-NEXT: movq %r8, 56(%rdx)
+; FALLBACK11-NEXT: movq %r15, 48(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK11-NEXT: movq %r9, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK11-NEXT: movq %r10, (%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT: movl (%rsi), %ecx
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %ecx
+; FALLBACK12-NEXT: negl %ecx
+; FALLBACK12-NEXT: movslq %ecx, %r9
+; FALLBACK12-NEXT: movq -32(%rsp,%r9), %rdi
+; FALLBACK12-NEXT: movq %rdi, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r8
+; FALLBACK12-NEXT: shrq %r8
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: orq %r10, %r8
+; FALLBACK12-NEXT: movq -24(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r11, %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK12-NEXT: movq %r15, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK12-NEXT: movq %r12, %rbx
+; FALLBACK12-NEXT: shrq %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: orq %r15, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: movq %r14, %r15
+; FALLBACK12-NEXT: shrq %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r15
+; FALLBACK12-NEXT: orq %r12, %r15
+; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK12-NEXT: movq %r12, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r13
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r13, %r10
+; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: shrq %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: orq %r9, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: movq %r14, (%rdx)
+; FALLBACK12-NEXT: movq %r12, 56(%rdx)
+; FALLBACK12-NEXT: movq %r10, 48(%rdx)
+; FALLBACK12-NEXT: movq %r15, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK12-NEXT: movq %r8, 32(%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT: movl (%rsi), %eax
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %eax
+; FALLBACK13-NEXT: negl %eax
+; FALLBACK13-NEXT: movslq %eax, %r8
+; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rax
+; FALLBACK13-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK13-NEXT: movq %r9, %rsi
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK13-NEXT: movq %r10, %rdi
+; FALLBACK13-NEXT: shldq %cl, %r9, %rdi
+; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r9
+; FALLBACK13-NEXT: shldq %cl, %r9, %rax
+; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK13-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK13-NEXT: movq %r14, %r15
+; FALLBACK13-NEXT: shldq %cl, %r10, %r15
+; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK13-NEXT: shldq %cl, %r14, %r8
+; FALLBACK13-NEXT: movq %r11, %r10
+; FALLBACK13-NEXT: shlq %cl, %r10
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK13-NEXT: movq %r8, 56(%rdx)
+; FALLBACK13-NEXT: movq %r15, 48(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK13-NEXT: movq %r9, 16(%rdx)
+; FALLBACK13-NEXT: movq %rax, 24(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK13-NEXT: movq %r10, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: pushq %rax
+; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT: movl (%rsi), %ecx
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rcx,8), %eax
+; FALLBACK14-NEXT: andl $56, %eax
+; FALLBACK14-NEXT: andl $56, %ecx
+; FALLBACK14-NEXT: negl %ecx
+; FALLBACK14-NEXT: movslq %ecx, %rsi
+; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %r12
+; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %r13
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rbx
+; FALLBACK14-NEXT: shlxq %rax, %rbx, %r11
+; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r15
+; FALLBACK14-NEXT: shlxq %rax, %r15, %r14
+; FALLBACK14-NEXT: movl %eax, %r10d
+; FALLBACK14-NEXT: notb %r10b
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %r10, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r12, %rdi
+; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %r12
+; FALLBACK14-NEXT: shlxq %rax, %r12, %rbp
+; FALLBACK14-NEXT: shrq %r8
+; FALLBACK14-NEXT: shrxq %r10, %r8, %r8
+; FALLBACK14-NEXT: orq %r13, %r8
+; FALLBACK14-NEXT: shlxq %rax, -8(%rsp,%rsi), %r13
+; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT: shlxq %rax, %rsi, %rax
+; FALLBACK14-NEXT: shrq %rbx
+; FALLBACK14-NEXT: shrxq %r10, %rbx, %rbx
+; FALLBACK14-NEXT: orq %r9, %rbx
+; FALLBACK14-NEXT: shrq %r15
+; FALLBACK14-NEXT: shrxq %r10, %r15, %r9
+; FALLBACK14-NEXT: orq %r11, %r9
+; FALLBACK14-NEXT: shrq %rsi
+; FALLBACK14-NEXT: shrxq %r10, %rsi, %rsi
+; FALLBACK14-NEXT: orq %r14, %rsi
+; FALLBACK14-NEXT: shrq %rcx
+; FALLBACK14-NEXT: shrxq %r10, %rcx, %rcx
+; FALLBACK14-NEXT: orq %rbp, %rcx
+; FALLBACK14-NEXT: shrq %r12
+; FALLBACK14-NEXT: shrxq %r10, %r12, %r10
+; FALLBACK14-NEXT: orq %r13, %r10
+; FALLBACK14-NEXT: movq %rax, (%rdx)
+; FALLBACK14-NEXT: movq %r10, 56(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 48(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK14-NEXT: movq %r9, 16(%rdx)
+; FALLBACK14-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK14-NEXT: movq %r8, 40(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK14-NEXT: addq $8, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: negl %eax
+; FALLBACK15-NEXT: movslq %eax, %r8
+; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rax
+; FALLBACK15-NEXT: movq -32(%rsp,%r8), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK15-NEXT: movq %r10, %rdi
+; FALLBACK15-NEXT: shldq %cl, %r9, %rdi
+; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r9
+; FALLBACK15-NEXT: shldq %cl, %r9, %rax
+; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK15-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK15-NEXT: movq %r14, %r15
+; FALLBACK15-NEXT: shldq %cl, %r10, %r15
+; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK15-NEXT: shldq %cl, %r14, %r8
+; FALLBACK15-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK15-NEXT: movq %r8, 56(%rdx)
+; FALLBACK15-NEXT: movq %r15, 48(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK15-NEXT: movq %r9, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK15-NEXT: movq %r10, (%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: shl_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%eax), %ebp
+; FALLBACK16-NEXT: movl 44(%eax), %ebx
+; FALLBACK16-NEXT: movl 48(%eax), %edi
+; FALLBACK16-NEXT: movl 52(%eax), %esi
+; FALLBACK16-NEXT: movl 56(%eax), %edx
+; FALLBACK16-NEXT: movl 60(%eax), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %eax
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: andl $60, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: subl %edx, %ecx
+; FALLBACK16-NEXT: movl (%ecx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %edx
+; FALLBACK16-NEXT: movl %ecx, %ebp
+; FALLBACK16-NEXT: shll $3, %eax
+; FALLBACK16-NEXT: andl $24, %eax
+; FALLBACK16-NEXT: movl %edx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %al, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 8(%ebp), %esi
+; FALLBACK16-NEXT: movl %ebp, %edi
+; FALLBACK16-NEXT: movl %esi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: movl 20(%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 16(%edi), %esi
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movl 28(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 24(%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %esi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%edx), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 32(%edx), %esi
+; FALLBACK16-NEXT: movl %edx, %ebp
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 44(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 40(%ebp), %esi
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 52(%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: negl %edx
+; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%edi), %edx
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl 56(%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %edx, %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 56(%eax)
+; FALLBACK16-NEXT: movl %edi, 60(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%ecx), %ebp
+; FALLBACK17-NEXT: movl 44(%ecx), %ebx
+; FALLBACK17-NEXT: movl 48(%ecx), %edi
+; FALLBACK17-NEXT: movl 52(%ecx), %esi
+; FALLBACK17-NEXT: movl 56(%ecx), %edx
+; FALLBACK17-NEXT: movl 60(%ecx), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: subl %ebp, %eax
+; FALLBACK17-NEXT: movl 8(%eax), %esi
+; FALLBACK17-NEXT: movl 12(%eax), %edx
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: movl %edx, %edi
+; FALLBACK17-NEXT: shldl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%eax), %edi
+; FALLBACK17-NEXT: movl 20(%eax), %esi
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%eax), %edi
+; FALLBACK17-NEXT: movl 28(%eax), %edx
+; FALLBACK17-NEXT: movl %edx, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%eax), %edi
+; FALLBACK17-NEXT: movl 36(%eax), %esi
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%eax), %edx
+; FALLBACK17-NEXT: movl 44(%eax), %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 56(%eax), %edx
+; FALLBACK17-NEXT: movl 60(%eax), %edi
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl (%eax), %ebx
+; FALLBACK17-NEXT: movl 52(%eax), %esi
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: negl %ebp
+; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl %edi, 60(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT: shll %cl, %ebx
+; FALLBACK17-NEXT: shldl %cl, %eax, %esi
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %edi, %eax
+; FALLBACK17-NEXT: movl %eax, 48(%ebp)
+; FALLBACK17-NEXT: movl %esi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl %edx, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebx
+; FALLBACK18-NEXT: movl 44(%eax), %edi
+; FALLBACK18-NEXT: movl 48(%eax), %esi
+; FALLBACK18-NEXT: movl 52(%eax), %edx
+; FALLBACK18-NEXT: movl 56(%eax), %ecx
+; FALLBACK18-NEXT: movl 60(%eax), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK18-NEXT: movl (%ebp), %ebp
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: leal (,%ebp,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ebp
+; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK18-NEXT: subl %ebp, %edi
+; FALLBACK18-NEXT: movl (%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%edi), %esi
+; FALLBACK18-NEXT: movl %esi, %ecx
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 12(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: movl 20(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 28(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %eax, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: movl 36(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 44(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %eax, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%edi), %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 52(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %eax, %ebp
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: negl %eax
+; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK18-NEXT: movl 56(%edi), %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %edx, %esi
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, (%eax)
+; FALLBACK18-NEXT: movl %esi, 56(%eax)
+; FALLBACK18-NEXT: movl %ecx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebp, 48(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $204, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl (%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%ebp), %ebx
+; FALLBACK19-NEXT: movl 44(%ebp), %edi
+; FALLBACK19-NEXT: movl 48(%ebp), %esi
+; FALLBACK19-NEXT: movl 52(%ebp), %edx
+; FALLBACK19-NEXT: movl 56(%ebp), %ecx
+; FALLBACK19-NEXT: movl 60(%ebp), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl (%ebp), %ebp
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: leal (,%ebp,8), %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: subl %ebp, %eax
+; FALLBACK19-NEXT: movl 4(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%eax), %edi
+; FALLBACK19-NEXT: movl 12(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%eax), %edi
+; FALLBACK19-NEXT: movl 20(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%eax), %edi
+; FALLBACK19-NEXT: movl 28(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%eax), %edi
+; FALLBACK19-NEXT: movl 36(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%eax), %ebx
+; FALLBACK19-NEXT: movl 44(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT: movl 56(%eax), %edx
+; FALLBACK19-NEXT: movl 60(%eax), %edi
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl (%eax), %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 52(%eax), %esi
+; FALLBACK19-NEXT: shldl %cl, %esi, %edx
+; FALLBACK19-NEXT: negl %ebp
+; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl %edx, 56(%eax)
+; FALLBACK19-NEXT: movl %edi, 60(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK19-NEXT: movl %ebp, 48(%eax)
+; FALLBACK19-NEXT: movl %esi, 52(%eax)
+; FALLBACK19-NEXT: movl %ebx, 40(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 44(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 32(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 36(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 24(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 28(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 16(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 20(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 8(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 12(%eax)
+; FALLBACK19-NEXT: movl %edi, 4(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, (%eax)
+; FALLBACK19-NEXT: addl $204, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: xorps %xmm4, %xmm4
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: andl $60, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: subl %edx, %ecx
+; FALLBACK20-NEXT: movl (%ecx), %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 4(%ecx), %edx
+; FALLBACK20-NEXT: movl %ecx, %ebp
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %esi, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 12(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 8(%ebp), %esi
+; FALLBACK20-NEXT: movl %ebp, %edi
+; FALLBACK20-NEXT: movl %esi, %ebp
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl 20(%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 16(%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %esi, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %ebp, %edx
+; FALLBACK20-NEXT: movl 28(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 24(%ebp), %esi
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %esi, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 36(%edx), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 32(%edx), %esi
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 40(%ebp), %esi
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%ebp), %esi
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: negl %edx
+; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: movl 60(%edi), %edx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl 56(%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %edx, (%eax)
+; FALLBACK20-NEXT: movl %esi, 56(%eax)
+; FALLBACK20-NEXT: movl %edi, 60(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT: movl (%eax), %ecx
+; FALLBACK21-NEXT: xorps %xmm4, %xmm4
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: subl %ebp, %eax
+; FALLBACK21-NEXT: movl 8(%eax), %esi
+; FALLBACK21-NEXT: movl 12(%eax), %edx
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: movl %edx, %edi
+; FALLBACK21-NEXT: shldl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 4(%eax), %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 16(%eax), %edi
+; FALLBACK21-NEXT: movl 20(%eax), %esi
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 24(%eax), %edi
+; FALLBACK21-NEXT: movl 28(%eax), %edx
+; FALLBACK21-NEXT: movl %edx, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 32(%eax), %edi
+; FALLBACK21-NEXT: movl 36(%eax), %esi
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%eax), %edx
+; FALLBACK21-NEXT: movl 44(%eax), %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%eax), %edx
+; FALLBACK21-NEXT: movl 60(%eax), %edi
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl (%eax), %ebx
+; FALLBACK21-NEXT: movl 52(%eax), %esi
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: negl %ebp
+; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl %edi, 60(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK21-NEXT: shll %cl, %ebx
+; FALLBACK21-NEXT: shldl %cl, %eax, %esi
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %edi, %eax
+; FALLBACK21-NEXT: movl %eax, 48(%ebp)
+; FALLBACK21-NEXT: movl %esi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl %edx, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT: movl (%eax), %eax
+; FALLBACK22-NEXT: xorps %xmm4, %xmm4
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: leal (,%eax,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK22-NEXT: subl %eax, %edi
+; FALLBACK22-NEXT: movl (%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 4(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 8(%edi), %esi
+; FALLBACK22-NEXT: movl %esi, %ecx
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 12(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 16(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: movl 20(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 24(%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 28(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %eax, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 32(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: movl 36(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 40(%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 44(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %eax, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: movl 52(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %eax, %ebp
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: negl %eax
+; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK22-NEXT: movl 56(%edi), %eax
+; FALLBACK22-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edx, %esi
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK22-NEXT: movl %edx, (%eax)
+; FALLBACK22-NEXT: movl %esi, 56(%eax)
+; FALLBACK22-NEXT: movl %ecx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebp, 48(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $204, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT: movl (%eax), %ebp
+; FALLBACK23-NEXT: xorps %xmm4, %xmm4
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: leal (,%ebp,8), %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: subl %ebp, %eax
+; FALLBACK23-NEXT: movl 4(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 8(%eax), %edi
+; FALLBACK23-NEXT: movl 12(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 16(%eax), %edi
+; FALLBACK23-NEXT: movl 20(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 24(%eax), %edi
+; FALLBACK23-NEXT: movl 28(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 32(%eax), %edi
+; FALLBACK23-NEXT: movl 36(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%eax), %ebx
+; FALLBACK23-NEXT: movl 44(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT: movl 56(%eax), %edx
+; FALLBACK23-NEXT: movl 60(%eax), %edi
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl (%eax), %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 52(%eax), %esi
+; FALLBACK23-NEXT: shldl %cl, %esi, %edx
+; FALLBACK23-NEXT: negl %ebp
+; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %edx, 56(%eax)
+; FALLBACK23-NEXT: movl %edi, 60(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK23-NEXT: movl %ebp, 48(%eax)
+; FALLBACK23-NEXT: movl %esi, 52(%eax)
+; FALLBACK23-NEXT: movl %ebx, 40(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 44(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 32(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 36(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 24(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 28(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 16(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 12(%eax)
+; FALLBACK23-NEXT: movl %edi, 4(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, (%eax)
+; FALLBACK23-NEXT: addl $204, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT: movl (%eax), %eax
+; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: andl $60, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: subl %edx, %ecx
+; FALLBACK24-NEXT: movl (%ecx), %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 4(%ecx), %edx
+; FALLBACK24-NEXT: movl %ecx, %ebp
+; FALLBACK24-NEXT: shll $3, %eax
+; FALLBACK24-NEXT: andl $24, %eax
+; FALLBACK24-NEXT: movl %edx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %al, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %esi, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 12(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 8(%ebp), %esi
+; FALLBACK24-NEXT: movl %ebp, %edi
+; FALLBACK24-NEXT: movl %esi, %ebp
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl 20(%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 16(%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %esi, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %ebp, %edx
+; FALLBACK24-NEXT: movl 28(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 24(%ebp), %esi
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %esi, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 36(%edx), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 32(%edx), %esi
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 40(%ebp), %esi
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%ebp), %esi
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: negl %edx
+; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: movl 60(%edi), %edx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl 56(%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %edx, (%eax)
+; FALLBACK24-NEXT: movl %esi, 56(%eax)
+; FALLBACK24-NEXT: movl %edi, 60(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT: movl (%eax), %ecx
+; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: subl %ebp, %eax
+; FALLBACK25-NEXT: movl 8(%eax), %esi
+; FALLBACK25-NEXT: movl 12(%eax), %edx
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: movl %edx, %edi
+; FALLBACK25-NEXT: shldl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 4(%eax), %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 16(%eax), %edi
+; FALLBACK25-NEXT: movl 20(%eax), %esi
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 24(%eax), %edi
+; FALLBACK25-NEXT: movl 28(%eax), %edx
+; FALLBACK25-NEXT: movl %edx, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 32(%eax), %edi
+; FALLBACK25-NEXT: movl 36(%eax), %esi
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%eax), %edx
+; FALLBACK25-NEXT: movl 44(%eax), %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%eax), %edx
+; FALLBACK25-NEXT: movl 60(%eax), %edi
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl (%eax), %ebx
+; FALLBACK25-NEXT: movl 52(%eax), %esi
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: negl %ebp
+; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl %edi, 60(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK25-NEXT: shll %cl, %ebx
+; FALLBACK25-NEXT: shldl %cl, %eax, %esi
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %edi, %eax
+; FALLBACK25-NEXT: movl %eax, 48(%ebp)
+; FALLBACK25-NEXT: movl %esi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl %edx, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT: movl (%eax), %eax
+; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: leal (,%eax,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK26-NEXT: subl %eax, %edi
+; FALLBACK26-NEXT: movl (%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 4(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 8(%edi), %esi
+; FALLBACK26-NEXT: movl %esi, %ecx
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 12(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 16(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: movl 20(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 24(%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 28(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 32(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: movl 36(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 40(%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 44(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 52(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %eax, %ebp
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: negl %eax
+; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK26-NEXT: movl 56(%edi), %eax
+; FALLBACK26-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edx, %esi
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK26-NEXT: movl %edx, (%eax)
+; FALLBACK26-NEXT: movl %esi, 56(%eax)
+; FALLBACK26-NEXT: movl %ecx, 60(%eax)
+; FALLBACK26-NEXT: movl %ebp, 48(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 52(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 40(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 44(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 32(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 36(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 24(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 28(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 16(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 20(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 8(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 12(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 4(%eax)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $204, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT: movl (%eax), %ebx
+; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: leal (,%ebx,8), %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: andl $60, %ebx
+; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: subl %ebx, %eax
+; FALLBACK27-NEXT: movl 4(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 8(%eax), %edi
+; FALLBACK27-NEXT: movl 12(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 16(%eax), %edi
+; FALLBACK27-NEXT: movl 20(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 24(%eax), %edi
+; FALLBACK27-NEXT: movl 28(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 32(%eax), %edi
+; FALLBACK27-NEXT: movl 36(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%eax), %ebp
+; FALLBACK27-NEXT: movl 44(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %ebp, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT: movl 56(%eax), %edx
+; FALLBACK27-NEXT: movl 60(%eax), %edi
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl (%eax), %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 52(%eax), %esi
+; FALLBACK27-NEXT: shldl %cl, %esi, %edx
+; FALLBACK27-NEXT: negl %ebx
+; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %edx, 56(%eax)
+; FALLBACK27-NEXT: movl %edi, 60(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: shldl %cl, %ebx, %esi
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT: movl %ebx, 48(%eax)
+; FALLBACK27-NEXT: movl %esi, 52(%eax)
+; FALLBACK27-NEXT: movl %ebp, 40(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 44(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 32(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 36(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 24(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 28(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 16(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 12(%eax)
+; FALLBACK27-NEXT: movl %edi, 4(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, (%eax)
+; FALLBACK27-NEXT: addl $204, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT: movl (%eax), %eax
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: andl $60, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: subl %edx, %ecx
+; FALLBACK28-NEXT: movl (%ecx), %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 4(%ecx), %edx
+; FALLBACK28-NEXT: movl %ecx, %ebp
+; FALLBACK28-NEXT: shll $3, %eax
+; FALLBACK28-NEXT: andl $24, %eax
+; FALLBACK28-NEXT: movl %edx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %al, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %esi, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 12(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 8(%ebp), %esi
+; FALLBACK28-NEXT: movl %ebp, %edi
+; FALLBACK28-NEXT: movl %esi, %ebp
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl 20(%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 16(%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %esi, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %ebp, %edx
+; FALLBACK28-NEXT: movl 28(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 24(%ebp), %esi
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %esi, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 36(%edx), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 32(%edx), %esi
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 40(%ebp), %esi
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%ebp), %esi
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: negl %edx
+; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: movl 60(%edi), %edx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl 56(%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %edx, (%eax)
+; FALLBACK28-NEXT: movl %esi, 56(%eax)
+; FALLBACK28-NEXT: movl %edi, 60(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT: movl (%eax), %ecx
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: subl %ebp, %eax
+; FALLBACK29-NEXT: movl 8(%eax), %esi
+; FALLBACK29-NEXT: movl 12(%eax), %edx
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: movl %edx, %edi
+; FALLBACK29-NEXT: shldl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 4(%eax), %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 16(%eax), %edi
+; FALLBACK29-NEXT: movl 20(%eax), %esi
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 24(%eax), %edi
+; FALLBACK29-NEXT: movl 28(%eax), %edx
+; FALLBACK29-NEXT: movl %edx, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 32(%eax), %edi
+; FALLBACK29-NEXT: movl 36(%eax), %esi
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%eax), %edx
+; FALLBACK29-NEXT: movl 44(%eax), %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%eax), %edx
+; FALLBACK29-NEXT: movl 60(%eax), %edi
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl (%eax), %ebx
+; FALLBACK29-NEXT: movl 52(%eax), %esi
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: negl %ebp
+; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl %edi, 60(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK29-NEXT: shll %cl, %ebx
+; FALLBACK29-NEXT: shldl %cl, %eax, %esi
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %edi, %eax
+; FALLBACK29-NEXT: movl %eax, 48(%ebp)
+; FALLBACK29-NEXT: movl %esi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl %edx, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT: movl (%eax), %eax
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: leal (,%eax,8), %edx
+; FALLBACK30-NEXT: andl $24, %edx
+; FALLBACK30-NEXT: andl $60, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK30-NEXT: subl %eax, %edi
+; FALLBACK30-NEXT: movl (%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 4(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %edx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 8(%edi), %esi
+; FALLBACK30-NEXT: movl %esi, %ecx
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 12(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 16(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: movl 20(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 24(%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 28(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 32(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: movl 36(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 40(%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 44(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 52(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %eax, %ebp
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: negl %eax
+; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK30-NEXT: movl 56(%edi), %eax
+; FALLBACK30-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edx, %esi
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK30-NEXT: movl %edx, (%eax)
+; FALLBACK30-NEXT: movl %esi, 56(%eax)
+; FALLBACK30-NEXT: movl %ecx, 60(%eax)
+; FALLBACK30-NEXT: movl %ebp, 48(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 52(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 40(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 44(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 32(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 36(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 24(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 28(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 16(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 20(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 8(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 12(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 4(%eax)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $204, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT: movl (%eax), %ebx
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: leal (,%ebx,8), %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: andl $60, %ebx
+; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: subl %ebx, %eax
+; FALLBACK31-NEXT: movl 4(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 8(%eax), %edi
+; FALLBACK31-NEXT: movl 12(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 16(%eax), %edi
+; FALLBACK31-NEXT: movl 20(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 24(%eax), %edi
+; FALLBACK31-NEXT: movl 28(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 32(%eax), %edi
+; FALLBACK31-NEXT: movl 36(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%eax), %ebp
+; FALLBACK31-NEXT: movl 44(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %ebp, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT: movl 56(%eax), %edx
+; FALLBACK31-NEXT: movl 60(%eax), %edi
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl (%eax), %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 52(%eax), %esi
+; FALLBACK31-NEXT: shldl %cl, %esi, %edx
+; FALLBACK31-NEXT: negl %ebx
+; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %edx, 56(%eax)
+; FALLBACK31-NEXT: movl %edi, 60(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: shldl %cl, %ebx, %esi
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT: movl %ebx, 48(%eax)
+; FALLBACK31-NEXT: movl %esi, 52(%eax)
+; FALLBACK31-NEXT: movl %ebp, 40(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 44(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 32(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 36(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 24(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 28(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 16(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 12(%eax)
+; FALLBACK31-NEXT: movl %edi, 4(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, (%eax)
+; FALLBACK31-NEXT: addl $204, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -2365,370 +16156,4089 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
ret void
}
define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_64bytes:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pushq %rbx
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %r9
-; X64-SSE2-NEXT: movq 32(%rdi), %r10
-; X64-SSE2-NEXT: movq 40(%rdi), %r11
-; X64-SSE2-NEXT: movq 48(%rdi), %rbx
-; X64-SSE2-NEXT: movq 56(%rdi), %rdi
-; X64-SSE2-NEXT: movl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: sarq $63, %rdi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
-; X64-SSE2-NEXT: movq %r11, 56(%rdx)
-; X64-SSE2-NEXT: movq %r10, 32(%rdx)
-; X64-SSE2-NEXT: movq %r9, 40(%rdx)
-; X64-SSE2-NEXT: movq %r8, 16(%rdx)
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rax, (%rdx)
-; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT: popq %rbx
-; X64-SSE2-NEXT: retq
-;
-; X64-SSE42-LABEL: ashr_64bytes:
-; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: movups (%rdi), %xmm0
-; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
-; X64-SSE42-NEXT: movq 48(%rdi), %rax
-; X64-SSE42-NEXT: movq 56(%rdi), %rcx
-; X64-SSE42-NEXT: movl (%rsi), %esi
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: sarq $63, %rcx
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %esi
-; X64-SSE42-NEXT: movups -128(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3
-; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
-; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: retq
-;
-; X64-AVX-LABEL: ashr_64bytes:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-AVX-NEXT: movq 48(%rdi), %rax
-; X64-AVX-NEXT: movq 56(%rdi), %rcx
-; X64-AVX-NEXT: movl (%rsi), %esi
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: sarq $63, %rcx
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $63, %esi
-; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2
-; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3
-; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
-;
-; X86-SSE2-LABEL: ashr_64bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 40(%eax), %ebp
-; X86-SSE2-NEXT: movl 44(%eax), %ebx
-; X86-SSE2-NEXT: movl 48(%eax), %edi
-; X86-SSE2-NEXT: movl 52(%eax), %esi
-; X86-SSE2-NEXT: movl 56(%eax), %edx
-; X86-SSE2-NEXT: movl 60(%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %eax
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: sarl $31, %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl %ecx, 56(%eax)
-; X86-SSE2-NEXT: movl %edx, 60(%eax)
-; X86-SSE2-NEXT: movl %esi, 48(%eax)
-; X86-SSE2-NEXT: movl %edi, 52(%eax)
-; X86-SSE2-NEXT: movl %ebx, 40(%eax)
-; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 32(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 36(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 24(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 28(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 16(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 20(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 8(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 12(%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE42-LABEL: ashr_64bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: pushl %ebx
-; X86-SSE42-NEXT: pushl %edi
-; X86-SSE42-NEXT: pushl %esi
-; X86-SSE42-NEXT: subl $128, %esp
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT: movups (%edx), %xmm0
-; X86-SSE42-NEXT: movups 16(%edx), %xmm1
-; X86-SSE42-NEXT: movups 32(%edx), %xmm2
-; X86-SSE42-NEXT: movl 48(%edx), %esi
-; X86-SSE42-NEXT: movl 52(%edx), %edi
-; X86-SSE42-NEXT: movl 56(%edx), %ebx
-; X86-SSE42-NEXT: movl 60(%edx), %edx
-; X86-SSE42-NEXT: movl (%ecx), %ecx
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: sarl $31, %edx
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
-; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $128, %esp
-; X86-SSE42-NEXT: popl %esi
-; X86-SSE42-NEXT: popl %edi
-; X86-SSE42-NEXT: popl %ebx
-; X86-SSE42-NEXT: retl
-;
-; X86-AVX-LABEL: ashr_64bytes:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: pushl %ebx
-; X86-AVX-NEXT: pushl %edi
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: subl $128, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %ymm0
-; X86-AVX-NEXT: vmovups 32(%edx), %xmm1
-; X86-AVX-NEXT: movl 48(%edx), %esi
-; X86-AVX-NEXT: movl 52(%edx), %edi
-; X86-AVX-NEXT: movl 56(%edx), %ebx
-; X86-AVX-NEXT: movl 60(%edx), %edx
-; X86-AVX-NEXT: movl (%ecx), %ecx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX-NEXT: sarl $31, %edx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $63, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3
-; X86-AVX-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $128, %esp
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: popl %edi
-; X86-AVX-NEXT: popl %ebx
-; X86-AVX-NEXT: vzeroupper
-; X86-AVX-NEXT: retl
+; FALLBACK0-LABEL: ashr_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq 16(%rdi), %rax
+; FALLBACK0-NEXT: movq 32(%rdi), %rcx
+; FALLBACK0-NEXT: movq 48(%rdi), %r8
+; FALLBACK0-NEXT: movq (%rdi), %r9
+; FALLBACK0-NEXT: movq 8(%rdi), %r10
+; FALLBACK0-NEXT: movq 24(%rdi), %r11
+; FALLBACK0-NEXT: movq 40(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %r14
+; FALLBACK0-NEXT: movl (%rsi), %edi
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %r14
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rdi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %edi
+; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r9
+; FALLBACK0-NEXT: movq %r8, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: leaq (%r14,%r14), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %r11, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r8, %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %rbx, %r8
+; FALLBACK0-NEXT: movq %r9, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: orq %r15, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: addq %r9, %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r14, %r9
+; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: movq %r14, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r12
+; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r14, %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: orq %rbx, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT: leaq (%rdi,%rdi), %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: orq %r13, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %rdi
+; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 48(%rdx)
+; FALLBACK0-NEXT: movq %r14, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %r9, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r10, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: pushq %rax
+; FALLBACK1-NEXT: movq 24(%rdi), %rcx
+; FALLBACK1-NEXT: movq 40(%rdi), %r8
+; FALLBACK1-NEXT: movq 56(%rdi), %r9
+; FALLBACK1-NEXT: movq (%rdi), %r10
+; FALLBACK1-NEXT: movq 8(%rdi), %r11
+; FALLBACK1-NEXT: movq 16(%rdi), %rbx
+; FALLBACK1-NEXT: movq 32(%rdi), %r14
+; FALLBACK1-NEXT: movq 48(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %eax
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %r9
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rax,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %eax
+; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq %r9, %r8
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq %r10, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r9, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r10, %rdi
+; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq -88(%rsp,%rax), %rbx
+; FALLBACK1-NEXT: movq %rbx, %r14
+; FALLBACK1-NEXT: shrdq %cl, %r10, %r14
+; FALLBACK1-NEXT: shrdq %cl, %rbx, %r9
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %r10
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %r10, 48(%rdx)
+; FALLBACK1-NEXT: movq %rax, 56(%rdx)
+; FALLBACK1-NEXT: movq %r9, 32(%rdx)
+; FALLBACK1-NEXT: movq %r14, 40(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %r11, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r8, 8(%rdx)
+; FALLBACK1-NEXT: addq $8, %rsp
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq 16(%rdi), %rcx
+; FALLBACK2-NEXT: movq 32(%rdi), %r8
+; FALLBACK2-NEXT: movq 48(%rdi), %r9
+; FALLBACK2-NEXT: movq (%rdi), %r10
+; FALLBACK2-NEXT: movq 8(%rdi), %r11
+; FALLBACK2-NEXT: movq 24(%rdi), %rbx
+; FALLBACK2-NEXT: movq 40(%rdi), %r14
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %eax
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rax,8), %ecx
+; FALLBACK2-NEXT: andl $56, %ecx
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8
+; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT: shrxq %rcx, %r8, %rbx
+; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbp
+; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r9
+; FALLBACK2-NEXT: shrxq %rcx, %r10, %r11
+; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r13
+; FALLBACK2-NEXT: movl %ecx, %r12d
+; FALLBACK2-NEXT: notb %r12b
+; FALLBACK2-NEXT: addq %r10, %r10
+; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT: orq %rbx, %r10
+; FALLBACK2-NEXT: addq %r8, %r8
+; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT: orq %rbp, %r8
+; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT: shrxq %rcx, %rbx, %rbp
+; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r9, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT: orq %r15, %r9
+; FALLBACK2-NEXT: addq %r14, %r14
+; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT: orq %r13, %r11
+; FALLBACK2-NEXT: addq %rax, %rax
+; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT: orq %rbp, %rax
+; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT: movq %rax, 48(%rdx)
+; FALLBACK2-NEXT: movq %r11, 32(%rdx)
+; FALLBACK2-NEXT: movq %r9, 40(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT: movq %r8, (%rdx)
+; FALLBACK2-NEXT: movq %r10, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: pushq %rax
+; FALLBACK3-NEXT: movq 24(%rdi), %rcx
+; FALLBACK3-NEXT: movq 40(%rdi), %r8
+; FALLBACK3-NEXT: movq 56(%rdi), %r9
+; FALLBACK3-NEXT: movq (%rdi), %r10
+; FALLBACK3-NEXT: movq 8(%rdi), %r11
+; FALLBACK3-NEXT: movq 16(%rdi), %rbx
+; FALLBACK3-NEXT: movq 32(%rdi), %r14
+; FALLBACK3-NEXT: movq 48(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %eax
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %r9
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rax,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %eax
+; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq %r9, %r8
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq %r10, %r11
+; FALLBACK3-NEXT: shrdq %cl, %r9, %r11
+; FALLBACK3-NEXT: shrdq %cl, %r10, %rdi
+; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq -88(%rsp,%rax), %rbx
+; FALLBACK3-NEXT: movq %rbx, %r14
+; FALLBACK3-NEXT: shrdq %cl, %r10, %r14
+; FALLBACK3-NEXT: shrdq %cl, %rbx, %r9
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %r10
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %r10, 48(%rdx)
+; FALLBACK3-NEXT: movq %r9, 32(%rdx)
+; FALLBACK3-NEXT: movq %r14, 40(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %r11, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r8, 8(%rdx)
+; FALLBACK3-NEXT: movq %rax, 56(%rdx)
+; FALLBACK3-NEXT: addq $8, %rsp
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbp
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: pushq %rax
+; FALLBACK4-NEXT: movq 56(%rdi), %rax
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movq 48(%rdi), %rcx
+; FALLBACK4-NEXT: movl (%rsi), %edi
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rax
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%rdi,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %edi
+; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %rbx
+; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r11
+; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK4-NEXT: leaq (%r11,%r11), %r9
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: orq %rbx, %r9
+; FALLBACK4-NEXT: movq %r10, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r14
+; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK4-NEXT: leaq (%r12,%r12), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r14, %rbx
+; FALLBACK4-NEXT: movq %r8, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r14
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r14, %r10
+; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK4-NEXT: movq %r14, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r13
+; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: orq %r13, %r15
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: addq %r14, %r14
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: orq %r12, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbp
+; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: orq %rbp, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: addq %r8, %r8
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: orq %r11, %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %rdi
+; FALLBACK4-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK4-NEXT: movq %r8, 8(%rdx)
+; FALLBACK4-NEXT: movq %r12, 48(%rdx)
+; FALLBACK4-NEXT: movq %r14, 32(%rdx)
+; FALLBACK4-NEXT: movq %r15, 40(%rdx)
+; FALLBACK4-NEXT: movq %r10, 16(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK4-NEXT: movq %r9, (%rdx)
+; FALLBACK4-NEXT: addq $8, %rsp
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: popq %rbp
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movq 48(%rdi), %rcx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movq 56(%rdi), %rdi
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r10
+; FALLBACK5-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r11
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT: movq -96(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq %r10, %r8
+; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT: movq %r11, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r10
+; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK5-NEXT: movq %r14, %r15
+; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT: shrdq %cl, %r14, %r9
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: sarq %cl, %rax
+; FALLBACK5-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK5-NEXT: movq %r10, 48(%rdx)
+; FALLBACK5-NEXT: movq %rax, 56(%rdx)
+; FALLBACK5-NEXT: movq %r9, 32(%rdx)
+; FALLBACK5-NEXT: movq %r15, 40(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK5-NEXT: movq %r8, 24(%rdx)
+; FALLBACK5-NEXT: movq %rsi, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: pushq %rax
+; FALLBACK6-NEXT: movq 56(%rdi), %rcx
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movq 48(%rdi), %rdi
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rcx
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %ecx
+; FALLBACK6-NEXT: andl $56, %ecx
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT: shrxq %rcx, %rdi, %r12
+; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rsi
+; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT: shrxq %rcx, %rsi, %r9
+; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT: shrxq %rcx, %r10, %r14
+; FALLBACK6-NEXT: shrxq %rcx, %r13, %r15
+; FALLBACK6-NEXT: movl %ecx, %ebx
+; FALLBACK6-NEXT: notb %bl
+; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT: orq %r11, %r8
+; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT: orq %r12, %r11
+; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT: shrxq %rcx, %r12, %r13
+; FALLBACK6-NEXT: shrxq %rcx, %rbp, %rbp
+; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT: sarxq %rcx, %rax, %rcx
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r9, %rdi
+; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT: orq %r14, %r9
+; FALLBACK6-NEXT: addq %r10, %r10
+; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT: orq %r15, %r10
+; FALLBACK6-NEXT: addq %rax, %rax
+; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT: orq %r13, %rax
+; FALLBACK6-NEXT: addq %rsi, %rsi
+; FALLBACK6-NEXT: shlxq %rbx, %rsi, %rsi
+; FALLBACK6-NEXT: orq %rbp, %rsi
+; FALLBACK6-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK6-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %r10, 32(%rdx)
+; FALLBACK6-NEXT: movq %r9, 40(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r8, (%rdx)
+; FALLBACK6-NEXT: addq $8, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movq 48(%rdi), %rcx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movq 56(%rdi), %rdi
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r9
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK7-NEXT: movq %r8, %r11
+; FALLBACK7-NEXT: shrdq %cl, %r10, %r11
+; FALLBACK7-NEXT: movq %r9, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r8, %rbx
+; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK7-NEXT: movq %r14, %r15
+; FALLBACK7-NEXT: shrdq %cl, %r8, %r15
+; FALLBACK7-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r8
+; FALLBACK7-NEXT: shrdq %cl, %r9, %rdi
+; FALLBACK7-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK7-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 48(%rdx)
+; FALLBACK7-NEXT: movq %r10, 32(%rdx)
+; FALLBACK7-NEXT: movq %r15, 40(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK7-NEXT: movq %r11, 24(%rdx)
+; FALLBACK7-NEXT: movq %rsi, (%rdx)
+; FALLBACK7-NEXT: movq %rax, 56(%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbp
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: pushq %rax
+; FALLBACK8-NEXT: movq 56(%rdi), %rax
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK8-NEXT: movq 48(%rdi), %rcx
+; FALLBACK8-NEXT: movl (%rsi), %edi
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rax
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%rdi,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %edi
+; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %rbx
+; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r11
+; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK8-NEXT: leaq (%r11,%r11), %r9
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: orq %rbx, %r9
+; FALLBACK8-NEXT: movq %r10, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r14
+; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK8-NEXT: leaq (%r12,%r12), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r14, %rbx
+; FALLBACK8-NEXT: movq %r8, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r14
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r14, %r10
+; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK8-NEXT: movq %r14, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r13
+; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: orq %r13, %r15
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: addq %r14, %r14
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: orq %r12, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbp
+; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: orq %rbp, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: addq %r8, %r8
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: orq %r11, %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %rdi
+; FALLBACK8-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK8-NEXT: movq %r8, 8(%rdx)
+; FALLBACK8-NEXT: movq %r12, 48(%rdx)
+; FALLBACK8-NEXT: movq %r14, 32(%rdx)
+; FALLBACK8-NEXT: movq %r15, 40(%rdx)
+; FALLBACK8-NEXT: movq %r10, 16(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK8-NEXT: movq %r9, (%rdx)
+; FALLBACK8-NEXT: addq $8, %rsp
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: popq %rbp
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: movq 48(%rdi), %rcx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK9-NEXT: movq 56(%rdi), %rdi
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r10
+; FALLBACK9-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r11
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT: movq -96(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq %r10, %r8
+; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT: movq %r11, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r10
+; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK9-NEXT: movq %r14, %r15
+; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT: shrdq %cl, %r14, %r9
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: sarq %cl, %rax
+; FALLBACK9-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK9-NEXT: movq %r10, 48(%rdx)
+; FALLBACK9-NEXT: movq %rax, 56(%rdx)
+; FALLBACK9-NEXT: movq %r9, 32(%rdx)
+; FALLBACK9-NEXT: movq %r15, 40(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK9-NEXT: movq %r8, 24(%rdx)
+; FALLBACK9-NEXT: movq %rsi, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: pushq %rax
+; FALLBACK10-NEXT: movq 56(%rdi), %rcx
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK10-NEXT: movq 48(%rdi), %rdi
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rcx
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %ecx
+; FALLBACK10-NEXT: andl $56, %ecx
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT: shrxq %rcx, %rdi, %r12
+; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rsi
+; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT: shrxq %rcx, %rsi, %r9
+; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT: shrxq %rcx, %r10, %r14
+; FALLBACK10-NEXT: shrxq %rcx, %r13, %r15
+; FALLBACK10-NEXT: movl %ecx, %ebx
+; FALLBACK10-NEXT: notb %bl
+; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT: orq %r11, %r8
+; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT: orq %r12, %r11
+; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT: shrxq %rcx, %r12, %r13
+; FALLBACK10-NEXT: shrxq %rcx, %rbp, %rbp
+; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT: sarxq %rcx, %rax, %rcx
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r9, %rdi
+; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT: orq %r14, %r9
+; FALLBACK10-NEXT: addq %r10, %r10
+; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT: orq %r15, %r10
+; FALLBACK10-NEXT: addq %rax, %rax
+; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT: orq %r13, %rax
+; FALLBACK10-NEXT: addq %rsi, %rsi
+; FALLBACK10-NEXT: shlxq %rbx, %rsi, %rsi
+; FALLBACK10-NEXT: orq %rbp, %rsi
+; FALLBACK10-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK10-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %r10, 32(%rdx)
+; FALLBACK10-NEXT: movq %r9, 40(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r8, (%rdx)
+; FALLBACK10-NEXT: addq $8, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: movq 48(%rdi), %rcx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK11-NEXT: movq 56(%rdi), %rdi
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r9
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK11-NEXT: movq %r8, %r11
+; FALLBACK11-NEXT: shrdq %cl, %r10, %r11
+; FALLBACK11-NEXT: movq %r9, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r8, %rbx
+; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK11-NEXT: movq %r14, %r15
+; FALLBACK11-NEXT: shrdq %cl, %r8, %r15
+; FALLBACK11-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r8
+; FALLBACK11-NEXT: shrdq %cl, %r9, %rdi
+; FALLBACK11-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK11-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 48(%rdx)
+; FALLBACK11-NEXT: movq %r10, 32(%rdx)
+; FALLBACK11-NEXT: movq %r15, 40(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK11-NEXT: movq %r11, 24(%rdx)
+; FALLBACK11-NEXT: movq %rsi, (%rdx)
+; FALLBACK11-NEXT: movq %rax, 56(%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbp
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: pushq %rax
+; FALLBACK12-NEXT: movq 56(%rdi), %rax
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK12-NEXT: movq 48(%rdi), %rcx
+; FALLBACK12-NEXT: movl (%rsi), %edi
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rax
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%rdi,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %edi
+; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %rbx
+; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r11
+; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK12-NEXT: leaq (%r11,%r11), %r9
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: orq %rbx, %r9
+; FALLBACK12-NEXT: movq %r10, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r14
+; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK12-NEXT: leaq (%r12,%r12), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r14, %rbx
+; FALLBACK12-NEXT: movq %r8, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r14
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r14, %r10
+; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK12-NEXT: movq %r14, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r13
+; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: orq %r13, %r15
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: addq %r14, %r14
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: orq %r12, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbp
+; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: orq %rbp, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: addq %r8, %r8
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: orq %r11, %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %rdi
+; FALLBACK12-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK12-NEXT: movq %r8, 8(%rdx)
+; FALLBACK12-NEXT: movq %r12, 48(%rdx)
+; FALLBACK12-NEXT: movq %r14, 32(%rdx)
+; FALLBACK12-NEXT: movq %r15, 40(%rdx)
+; FALLBACK12-NEXT: movq %r10, 16(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK12-NEXT: movq %r9, (%rdx)
+; FALLBACK12-NEXT: addq $8, %rsp
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: popq %rbp
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: movq 48(%rdi), %rcx
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK13-NEXT: movq 56(%rdi), %rdi
+; FALLBACK13-NEXT: movl (%rsi), %eax
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %eax
+; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r10
+; FALLBACK13-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r11
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK13-NEXT: movq -96(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq %r10, %r8
+; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT: movq %r11, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r10
+; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK13-NEXT: movq %r14, %r15
+; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT: shrdq %cl, %r14, %r9
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: sarq %cl, %rax
+; FALLBACK13-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK13-NEXT: movq %r10, 48(%rdx)
+; FALLBACK13-NEXT: movq %rax, 56(%rdx)
+; FALLBACK13-NEXT: movq %r9, 32(%rdx)
+; FALLBACK13-NEXT: movq %r15, 40(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK13-NEXT: movq %r8, 24(%rdx)
+; FALLBACK13-NEXT: movq %rsi, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: pushq %rax
+; FALLBACK14-NEXT: movq 56(%rdi), %rcx
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK14-NEXT: movq 48(%rdi), %rdi
+; FALLBACK14-NEXT: movl (%rsi), %eax
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rcx
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rax,8), %ecx
+; FALLBACK14-NEXT: andl $56, %ecx
+; FALLBACK14-NEXT: andl $56, %eax
+; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11
+; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12
+; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rsi
+; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rsi, %r9
+; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14
+; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15
+; FALLBACK14-NEXT: movl %ecx, %ebx
+; FALLBACK14-NEXT: notb %bl
+; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT: orq %r11, %r8
+; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT: orq %r12, %r11
+; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK14-NEXT: sarxq %rcx, %rax, %rcx
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r9, %rdi
+; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT: orq %r14, %r9
+; FALLBACK14-NEXT: addq %r10, %r10
+; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT: orq %r15, %r10
+; FALLBACK14-NEXT: addq %rax, %rax
+; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT: orq %r13, %rax
+; FALLBACK14-NEXT: addq %rsi, %rsi
+; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT: orq %rbp, %rsi
+; FALLBACK14-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK14-NEXT: movq %rax, 48(%rdx)
+; FALLBACK14-NEXT: movq %r10, 32(%rdx)
+; FALLBACK14-NEXT: movq %r9, 40(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r8, (%rdx)
+; FALLBACK14-NEXT: addq $8, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: movq 48(%rdi), %rcx
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK15-NEXT: movq 56(%rdi), %rdi
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r9
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK15-NEXT: movq %r8, %r11
+; FALLBACK15-NEXT: shrdq %cl, %r10, %r11
+; FALLBACK15-NEXT: movq %r9, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r8, %rbx
+; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK15-NEXT: movq %r14, %r15
+; FALLBACK15-NEXT: shrdq %cl, %r8, %r15
+; FALLBACK15-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r8
+; FALLBACK15-NEXT: shrdq %cl, %r9, %rdi
+; FALLBACK15-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK15-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 48(%rdx)
+; FALLBACK15-NEXT: movq %r10, 32(%rdx)
+; FALLBACK15-NEXT: movq %r15, 40(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK15-NEXT: movq %r11, 24(%rdx)
+; FALLBACK15-NEXT: movq %rsi, (%rdx)
+; FALLBACK15-NEXT: movq %rax, 56(%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: ashr_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%ecx), %ebx
+; FALLBACK16-NEXT: movl 44(%ecx), %edi
+; FALLBACK16-NEXT: movl 48(%ecx), %esi
+; FALLBACK16-NEXT: movl 52(%ecx), %edx
+; FALLBACK16-NEXT: movl 56(%ecx), %eax
+; FALLBACK16-NEXT: movl 60(%ecx), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT: movl (%ebp), %ebp
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: sarl $31, %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, %ecx
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: andl $60, %esi
+; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT: shll $3, %ecx
+; FALLBACK16-NEXT: andl $24, %ecx
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: movl %ecx, %ebx
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %eax, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %edx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %edx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %esi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebx, %edx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi
+; FALLBACK16-NEXT: movl %edi, %eax
+; FALLBACK16-NEXT: movl %edx, %ebx
+; FALLBACK16-NEXT: movl %ebx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: movl %ebx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx
+; FALLBACK16-NEXT: leal (%edx,%edx), %ebp
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT: sarl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 60(%eax)
+; FALLBACK16-NEXT: movl %edx, 56(%eax)
+; FALLBACK16-NEXT: movl %esi, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl %edi, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: ashr_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl (%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%eax), %ebp
+; FALLBACK17-NEXT: movl 44(%eax), %ebx
+; FALLBACK17-NEXT: movl 48(%eax), %edi
+; FALLBACK17-NEXT: movl 52(%eax), %esi
+; FALLBACK17-NEXT: movl 56(%eax), %edx
+; FALLBACK17-NEXT: movl 60(%eax), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: sarl $31, %eax
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %edx
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: sarl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 60(%ebp)
+; FALLBACK17-NEXT: movl %esi, 48(%ebp)
+; FALLBACK17-NEXT: movl %edi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: ashr_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebp
+; FALLBACK18-NEXT: movl 44(%eax), %ebx
+; FALLBACK18-NEXT: movl 48(%eax), %edi
+; FALLBACK18-NEXT: movl 52(%eax), %esi
+; FALLBACK18-NEXT: movl 56(%eax), %edx
+; FALLBACK18-NEXT: movl 60(%eax), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %eax
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: sarl $31, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
+; FALLBACK18-NEXT: leal (,%eax,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ecx
+; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl %ecx, %edi
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %eax, %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %edx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebx, 56(%eax)
+; FALLBACK18-NEXT: movl %edi, 48(%eax)
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl %esi, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: ashr_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $188, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl (%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%eax), %ebp
+; FALLBACK19-NEXT: movl 44(%eax), %ebx
+; FALLBACK19-NEXT: movl 48(%eax), %edi
+; FALLBACK19-NEXT: movl 52(%eax), %esi
+; FALLBACK19-NEXT: movl 56(%eax), %edx
+; FALLBACK19-NEXT: movl 60(%eax), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %ecx
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: sarl $31, %eax
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, %ebp
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shll $3, %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl %edi, %edx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 56(%ebp)
+; FALLBACK19-NEXT: movl %esi, 48(%ebp)
+; FALLBACK19-NEXT: movl %edx, 52(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 44(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 32(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 36(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 16(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT: movl %eax, 60(%ebp)
+; FALLBACK19-NEXT: addl $188, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: ashr_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movl 48(%ecx), %edx
+; FALLBACK20-NEXT: movl 52(%ecx), %esi
+; FALLBACK20-NEXT: movl 56(%ecx), %edi
+; FALLBACK20-NEXT: movl 60(%ecx), %ecx
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: sarl $31, %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %esi
+; FALLBACK20-NEXT: andl $60, %esi
+; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %edi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %eax, %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT: leal (%edx,%edx), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: addl %edi, %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %eax, %edx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT: sarl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 60(%eax)
+; FALLBACK20-NEXT: movl %edx, 56(%eax)
+; FALLBACK20-NEXT: movl %esi, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl %edi, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: ashr_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movups (%eax), %xmm0
+; FALLBACK21-NEXT: movups 16(%eax), %xmm1
+; FALLBACK21-NEXT: movups 32(%eax), %xmm2
+; FALLBACK21-NEXT: movl 48(%eax), %edx
+; FALLBACK21-NEXT: movl 52(%eax), %esi
+; FALLBACK21-NEXT: movl 56(%eax), %edi
+; FALLBACK21-NEXT: movl 60(%eax), %eax
+; FALLBACK21-NEXT: movl (%ecx), %ecx
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: sarl $31, %eax
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %esi
+; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %edx
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: sarl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 60(%ebp)
+; FALLBACK21-NEXT: movl %esi, 48(%ebp)
+; FALLBACK21-NEXT: movl %edi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: ashr_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movl 48(%ecx), %edx
+; FALLBACK22-NEXT: movl 52(%ecx), %esi
+; FALLBACK22-NEXT: movl 56(%ecx), %edi
+; FALLBACK22-NEXT: movl 60(%ecx), %ecx
+; FALLBACK22-NEXT: movl (%eax), %eax
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: sarl $31, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: leal (,%eax,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: orl %edi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl %ecx, %edi
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %eax, %eax
+; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK22-NEXT: addl %ebp, %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT: orl %eax, %ebx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl %edx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebx, 56(%eax)
+; FALLBACK22-NEXT: movl %edi, 48(%eax)
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl %esi, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, (%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: ashr_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $188, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movups (%eax), %xmm0
+; FALLBACK23-NEXT: movups 16(%eax), %xmm1
+; FALLBACK23-NEXT: movups 32(%eax), %xmm2
+; FALLBACK23-NEXT: movl 48(%eax), %edx
+; FALLBACK23-NEXT: movl 52(%eax), %esi
+; FALLBACK23-NEXT: movl 56(%eax), %edi
+; FALLBACK23-NEXT: movl 60(%eax), %eax
+; FALLBACK23-NEXT: movl (%ecx), %ecx
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: sarl $31, %eax
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ecx, %ebp
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shll $3, %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %esi
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl %edi, %edx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %eax, 56(%ebp)
+; FALLBACK23-NEXT: movl %esi, 48(%ebp)
+; FALLBACK23-NEXT: movl %edx, 52(%ebp)
+; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 44(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 32(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 36(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 24(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 28(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 16(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 20(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 8(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 12(%ebp)
+; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, (%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT: movl %eax, 60(%ebp)
+; FALLBACK23-NEXT: addl $188, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: ashr_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK24-NEXT: movl 48(%ecx), %edx
+; FALLBACK24-NEXT: movl 52(%ecx), %esi
+; FALLBACK24-NEXT: movl 56(%ecx), %edi
+; FALLBACK24-NEXT: movl 60(%ecx), %ecx
+; FALLBACK24-NEXT: movl (%eax), %eax
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: sarl $31, %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %eax, %esi
+; FALLBACK24-NEXT: andl $60, %esi
+; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT: shll $3, %eax
+; FALLBACK24-NEXT: andl $24, %eax
+; FALLBACK24-NEXT: movl %edx, %edi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb %al, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %edi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %eax, %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT: leal (%edx,%edx), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: addl %edi, %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %eax, %edx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT: sarl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 60(%eax)
+; FALLBACK24-NEXT: movl %edx, 56(%eax)
+; FALLBACK24-NEXT: movl %esi, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl %edi, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: ashr_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: vmovups (%eax), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK25-NEXT: movl 48(%eax), %edx
+; FALLBACK25-NEXT: movl 52(%eax), %esi
+; FALLBACK25-NEXT: movl 56(%eax), %edi
+; FALLBACK25-NEXT: movl 60(%eax), %eax
+; FALLBACK25-NEXT: movl (%ecx), %ecx
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: sarl $31, %eax
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %esi
+; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %edx
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: sarl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 60(%ebp)
+; FALLBACK25-NEXT: movl %esi, 48(%ebp)
+; FALLBACK25-NEXT: movl %edi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: ashr_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK26-NEXT: movl 48(%ecx), %edx
+; FALLBACK26-NEXT: movl 52(%ecx), %esi
+; FALLBACK26-NEXT: movl 56(%ecx), %edi
+; FALLBACK26-NEXT: movl 60(%ecx), %ecx
+; FALLBACK26-NEXT: movl (%eax), %eax
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: sarl $31, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: leal (,%eax,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: orl %edi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl %ecx, %edi
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT: orl %edi, %ecx
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %eax, %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK26-NEXT: addl %ebp, %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK26-NEXT: orl %eax, %ebx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl %edx, 60(%eax)
+; FALLBACK26-NEXT: movl %ebx, 56(%eax)
+; FALLBACK26-NEXT: movl %edi, 48(%eax)
+; FALLBACK26-NEXT: movl %ecx, 52(%eax)
+; FALLBACK26-NEXT: movl %esi, 40(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 44(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 32(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 36(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 24(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 28(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 16(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 20(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 8(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 12(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, (%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 4(%eax)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: ashr_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $188, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: vmovups (%eax), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK27-NEXT: movl 48(%eax), %edx
+; FALLBACK27-NEXT: movl 52(%eax), %esi
+; FALLBACK27-NEXT: movl 56(%eax), %edi
+; FALLBACK27-NEXT: movl 60(%eax), %eax
+; FALLBACK27-NEXT: movl (%ecx), %ecx
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: sarl $31, %eax
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ecx, %ebp
+; FALLBACK27-NEXT: andl $60, %ebp
+; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shll $3, %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %esi
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl %edi, %edx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %eax, 56(%ebp)
+; FALLBACK27-NEXT: movl %esi, 48(%ebp)
+; FALLBACK27-NEXT: movl %edx, 52(%ebp)
+; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 44(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 32(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 36(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 24(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 28(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 16(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 20(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 8(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 12(%ebp)
+; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, (%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT: movl %eax, 60(%ebp)
+; FALLBACK27-NEXT: addl $188, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: ashr_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK28-NEXT: movl 48(%ecx), %edx
+; FALLBACK28-NEXT: movl 52(%ecx), %esi
+; FALLBACK28-NEXT: movl 56(%ecx), %edi
+; FALLBACK28-NEXT: movl 60(%ecx), %ecx
+; FALLBACK28-NEXT: movl (%eax), %eax
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: sarl $31, %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %eax, %esi
+; FALLBACK28-NEXT: andl $60, %esi
+; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT: shll $3, %eax
+; FALLBACK28-NEXT: andl $24, %eax
+; FALLBACK28-NEXT: movl %edx, %edi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb %al, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %edi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %eax, %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT: leal (%edx,%edx), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: addl %edi, %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %eax, %edx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT: sarl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 60(%eax)
+; FALLBACK28-NEXT: movl %edx, 56(%eax)
+; FALLBACK28-NEXT: movl %esi, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl %edi, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: ashr_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: vmovups (%eax), %ymm0
+; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK29-NEXT: movl 48(%eax), %edx
+; FALLBACK29-NEXT: movl 52(%eax), %esi
+; FALLBACK29-NEXT: movl 56(%eax), %edi
+; FALLBACK29-NEXT: movl 60(%eax), %eax
+; FALLBACK29-NEXT: movl (%ecx), %ecx
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: sarl $31, %eax
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %esi
+; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %edx
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: sarl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 60(%ebp)
+; FALLBACK29-NEXT: movl %esi, 48(%ebp)
+; FALLBACK29-NEXT: movl %edi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: ashr_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK30-NEXT: movl 48(%ecx), %edx
+; FALLBACK30-NEXT: movl 52(%ecx), %esi
+; FALLBACK30-NEXT: movl 56(%ecx), %edi
+; FALLBACK30-NEXT: movl 60(%ecx), %ecx
+; FALLBACK30-NEXT: movl (%eax), %eax
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: sarl $31, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: leal (,%eax,8), %edx
+; FALLBACK30-NEXT: andl $24, %edx
+; FALLBACK30-NEXT: andl $60, %ecx
+; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %edx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: orl %edi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl %ecx, %edi
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK30-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK30-NEXT: orl %edi, %ecx
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %eax, %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK30-NEXT: addl %ebp, %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK30-NEXT: orl %eax, %ebx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl %edx, 60(%eax)
+; FALLBACK30-NEXT: movl %ebx, 56(%eax)
+; FALLBACK30-NEXT: movl %edi, 48(%eax)
+; FALLBACK30-NEXT: movl %ecx, 52(%eax)
+; FALLBACK30-NEXT: movl %esi, 40(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 44(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 32(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 36(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 24(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 28(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 16(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 20(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 8(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 12(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, (%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 4(%eax)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: ashr_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $188, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: vmovups (%eax), %ymm0
+; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK31-NEXT: movl 48(%eax), %edx
+; FALLBACK31-NEXT: movl 52(%eax), %esi
+; FALLBACK31-NEXT: movl 56(%eax), %edi
+; FALLBACK31-NEXT: movl 60(%eax), %eax
+; FALLBACK31-NEXT: movl (%ecx), %ecx
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: sarl $31, %eax
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ecx, %ebp
+; FALLBACK31-NEXT: andl $60, %ebp
+; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shll $3, %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %esi
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl %edi, %edx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %eax, 56(%ebp)
+; FALLBACK31-NEXT: movl %esi, 48(%ebp)
+; FALLBACK31-NEXT: movl %edx, 52(%ebp)
+; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 44(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 32(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 36(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 24(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 28(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 16(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 20(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 8(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 12(%ebp)
+; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, (%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT: movl %eax, 60(%ebp)
+; FALLBACK31-NEXT: addl $188, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -2738,37 +20248,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; FALLBACK0: {{.*}}
-; FALLBACK1: {{.*}}
-; FALLBACK10: {{.*}}
-; FALLBACK11: {{.*}}
-; FALLBACK12: {{.*}}
-; FALLBACK13: {{.*}}
-; FALLBACK14: {{.*}}
-; FALLBACK15: {{.*}}
-; FALLBACK16: {{.*}}
-; FALLBACK17: {{.*}}
-; FALLBACK18: {{.*}}
-; FALLBACK19: {{.*}}
-; FALLBACK2: {{.*}}
-; FALLBACK20: {{.*}}
-; FALLBACK21: {{.*}}
-; FALLBACK22: {{.*}}
-; FALLBACK23: {{.*}}
-; FALLBACK24: {{.*}}
-; FALLBACK25: {{.*}}
-; FALLBACK26: {{.*}}
-; FALLBACK27: {{.*}}
-; FALLBACK28: {{.*}}
-; FALLBACK29: {{.*}}
-; FALLBACK3: {{.*}}
-; FALLBACK30: {{.*}}
-; FALLBACK31: {{.*}}
-; FALLBACK4: {{.*}}
-; FALLBACK5: {{.*}}
-; FALLBACK6: {{.*}}
-; FALLBACK7: {{.*}}
-; FALLBACK8: {{.*}}
-; FALLBACK9: {{.*}}
; X64: {{.*}}
+; X64-AVX: {{.*}}
+; X64-AVX1: {{.*}}
+; X64-AVX512: {{.*}}
+; X64-SSE2: {{.*}}
+; X64-SSE42: {{.*}}
; X86: {{.*}}
+; X86-AVX: {{.*}}
+; X86-AVX1: {{.*}}
+; X86-AVX512: {{.*}}
+; X86-SSE2: {{.*}}
+; X86-SSE42: {{.*}}
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 5c9c81758d633..4e33deb825500 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,22 +588,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
@@ -641,7 +639,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -654,41 +652,39 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -701,51 +697,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -758,42 +752,40 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -884,64 +876,62 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $60, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $60, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -950,47 +940,48 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %dl
; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -999,30 +990,28 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
@@ -1044,7 +1033,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1053,47 +1042,49 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1180,61 +1171,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 8(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 8(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1247,42 +1238,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1295,52 +1286,52 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1353,43 +1344,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1406,44 +1397,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9,8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9,8), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
@@ -1459,110 +1449,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rdi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rdi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
@@ -1572,107 +1559,99 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch
; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi,4), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax,4), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
@@ -1682,8 +1661,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 20(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
@@ -1691,7 +1670,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1704,77 +1683,73 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%eax,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%eax,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%eax,4), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%eax,4), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1787,100 +1762,99 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1893,78 +1867,74 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%eax,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%eax,4), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%eax,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%eax,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1980,121 +1950,119 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: andb $24, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: negb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $24, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
@@ -2102,9 +2070,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
@@ -2116,36 +2084,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $24, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2154,63 +2122,57 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $84, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al
; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al
; X86-NO-BMI2-NO-SHLD-NEXT: negb %al
-; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
@@ -2219,52 +2181,52 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2273,7 +2235,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $84, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2286,78 +2248,80 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $28, %al
; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%ebx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2370,99 +2334,89 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 84(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2473,7 +2427,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2486,80 +2440,80 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $108, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $28, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%ebx), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2575,45 +2529,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9,8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9,8), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
@@ -2629,113 +2583,113 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rdi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rdi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rcx, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
@@ -2745,118 +2699,115 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%edx), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %al
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx,4), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebx,4), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx,4), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2865,7 +2816,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2878,78 +2829,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%eax,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%eax,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%eax,4), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%eax,4), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2962,101 +2914,105 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -3069,79 +3025,80 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $80, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%eax,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%eax,4), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%eax,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%eax,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -3157,181 +3114,171 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 48(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3342,83 +3289,80 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -3429,68 +3373,65 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r10
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3499,42 +3440,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3543,6 +3488,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3555,61 +3501,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3618,40 +3549,39 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
@@ -3660,11 +3590,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -3675,42 +3605,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -3723,19 +3653,19 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
@@ -3759,7 +3689,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -3772,7 +3702,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3802,6 +3732,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
@@ -3816,6 +3750,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -3828,22 +3763,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
@@ -3929,7 +3848,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -3942,7 +3861,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $184, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3963,24 +3882,28 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -3988,6 +3911,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4000,129 +3924,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 56(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
@@ -4130,7 +4038,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
@@ -4150,7 +4058,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $184, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -4163,7 +4071,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4193,6 +4101,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
@@ -4207,6 +4119,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4219,22 +4132,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
@@ -4318,7 +4215,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4338,65 +4235,62 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi
; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -4409,10 +4303,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
@@ -4429,11 +4323,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 32(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
@@ -4449,64 +4343,64 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
@@ -4519,83 +4413,82 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -4608,63 +4501,63 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r10
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r10
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
@@ -4675,7 +4568,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4697,22 +4590,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4721,6 +4614,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4733,74 +4629,83 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: subl %ecx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch
; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
@@ -4808,18 +4713,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
@@ -4827,92 +4731,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 56(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 52(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx)
@@ -4936,7 +4820,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -4949,7 +4833,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4979,6 +4863,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
@@ -4993,6 +4879,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5005,108 +4894,92 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %ebp, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -5119,36 +4992,38 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $192, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -5163,6 +5038,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5180,131 +5058,119 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
@@ -5330,7 +5196,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $192, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -5343,7 +5209,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $180, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -5364,7 +5230,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ebx), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ebx), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ebx), %esi
@@ -5373,13 +5239,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5387,6 +5255,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5405,22 +5276,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
@@ -5455,7 +5310,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi
@@ -5465,7 +5320,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 164(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 176(%esp,%ebx), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
@@ -5481,7 +5336,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
@@ -5502,7 +5357,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $180, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -5517,183 +5372,179 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %edi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 48(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -5704,28 +5555,29 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
@@ -5734,54 +5586,53 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -5792,69 +5643,69 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r10
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -5863,7 +5714,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6124,7 +5975,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -6137,7 +5988,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6295,7 +6146,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -6308,7 +6159,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $188, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6331,7 +6182,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
@@ -6343,7 +6194,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -6387,8 +6238,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
@@ -6403,15 +6254,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 60(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6421,12 +6272,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6436,11 +6287,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6449,26 +6300,26 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6477,11 +6328,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -6489,7 +6340,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx
@@ -6503,7 +6354,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
@@ -6523,7 +6374,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $188, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -6536,7 +6387,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6692,7 +6543,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 9ae1f270e8833..08d0eef07951c 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -432,30 +432,92 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movb %dl, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -505,30 +567,92 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movw %dx, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -577,30 +701,92 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movl %edx, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -649,32 +835,134 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $36, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl 8(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $36, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -689,58 +977,123 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movzbl -64(%rsp,%rax), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT: movb %al, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -756,58 +1109,136 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -822,58 +1253,136 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movl -64(%rsp,%rax), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -888,60 +1397,191 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movl %ecx, %eax
+; X64-SHLD-NEXT: shrb $6, %al
+; X64-SHLD-NEXT: movzbl %al, %eax
+; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $64, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $64, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -956,70 +1596,288 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
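
The *-SHLD prefixes above fold that shr/notb/shl/or sequence into a single double-shift instruction. For reference, `shrdl %cl, %hi, %lo` behaves like the helper below (my naming; like the hardware, the count is masked to 5 bits and a zero count is a no-op):

    #include <cstdint>

    // Model of shrdl: funnel-shift the pair (hi:lo) right by c and
    // return the low 32 bits.
    uint32_t shrd32(uint32_t lo, uint32_t hi, unsigned c) {
      c &= 31;
      return c == 0 ? lo : (lo >> c) | (hi << (32 - c));
    }
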
define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-NEXT: movq -56(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, 8(%rdx)
-; X64-NEXT: movq %rcx, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $92, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %eax
+; X86-SHLD-NEXT: shrb $5, %al
+; X86-SHLD-NEXT: movzbl %al, %ebx
+; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $92, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -1034,84 +1892,155 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
}
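
The 16-byte chunk above extends the same idea to a chain: each output word is the funnel of two adjacent aligned words, so consecutive iterations share one load. A rough C++ rendering of the whole chain (again a sketch under my own naming, not the patch's code):

    #include <cstdint>
    #include <cstring>

    void load_16byte_chunk_model(const uint8_t *src, unsigned bitOff,
                                 uint32_t out[4]) {
      alignas(16) uint8_t buf[64] = {}; // zero-filled 2x-width stack slot
      std::memcpy(buf, src, 16);
      unsigned word = bitOff >> 5, c = bitOff & 31;
      for (int i = 0; i < 4; ++i) {
        uint32_t lo, hi;
        std::memcpy(&lo, buf + 4 * (word + i), 4);
        std::memcpy(&hi, buf + 4 * (word + i) + 4, 4);
        out[i] = (lo >> c) | ((hi << 1) << (~c & 31)); // same funnel as above
      }
    }
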
define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movzbl -128(%rsp,%rsi), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movb %sil, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movb %cl, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1127,84 +2056,155 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
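
For the 64-byte-alloca tests below, the shift amount is a known byte multiple, so this is the one-step case: the lowering splits it once into an in-qword bit shift (`leal (,%rsi,8), %ecx; andl $56, %ecx`) and a qword-aligned byte offset (`andl $56, %esi`), with no freeze and no second use of the original amount. Sketched in C++ for the x64 flavor (names and signature are mine):

    #include <cstdint>
    #include <cstring>

    uint8_t load_1byte_chunk_model(const uint8_t *src, unsigned byteOff) {
      alignas(16) uint8_t buf[128] = {}; // zero-filled 2x-width stack slot
      std::memcpy(buf, src, 32);         // the 32 initialized bytes
      unsigned c   = (byteOff * 8) & 56; // bit shift inside the qword
      unsigned off = byteOff & 56;       // qword-aligned byte offset
      uint64_t lo;
      std::memcpy(&lo, buf + off, 8);
      uint32_t hi;
      std::memcpy(&hi, buf + off + 8, 4); // 32-bit load, as in the checks
      // addl %esi, %esi wraps at 32 bits before the widening shift.
      uint64_t v = (lo >> c) | ((uint64_t)(hi + hi) << (~c & 63));
      return (uint8_t)v;
    }
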
define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movw %si, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1219,84 +2219,155 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
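
One last note on the check prefixes: the *-HAVE-BMI2 variants use shrx/shlx, which take the count in any register, mask it the same way ordinary shifts do, and leave EFLAGS untouched, which is why the count need not live in %cl there. In plain C++ terms each is simply (sketch):

    #include <cstdint>

    // shrxq / shlxq semantics: masked shift with a free count register
    // and no flag updates.
    static inline uint64_t shrx64(uint64_t a, uint64_t c) { return a >> (c & 63); }
    static inline uint64_t shlx64(uint64_t a, uint64_t c) { return a << (c & 63); }
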
define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movl -128(%rsp,%rsi), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movl %esi, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1311,86 +2382,216 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: pushq %rax
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: andl $56, %esi
+; X64-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: movq %rax, (%rdx)
+; X64-SHLD-NEXT: popq %rax
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $128, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %esi
+; X86-SHLD-NEXT: andl $60, %esi
+; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-SHLD-NEXT: movl (%esp,%esi), %edx
+; X86-SHLD-NEXT: movl 4(%esp,%esi), %esi
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: movl %esi, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edx, (%eax)
+; X86-SHLD-NEXT: addl $128, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1405,96 +2606,326 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-NEXT: movq %rcx, 8(%rdx)
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $156, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edi
+; X86-SHLD-NEXT: andl $60, %edi
+; X86-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%edi), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%edi), %ebx
+; X86-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: movl 28(%esp,%edi), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%edi), %edi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edi, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $156, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1509,116 +2940,484 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-NEXT: movq -112(%rsp,%rsi), %rdi
-; X64-NEXT: movq -104(%rsp,%rsi), %rsi
-; X64-NEXT: movq %rsi, 24(%rdx)
-; X64-NEXT: movq %rdi, 16(%rdx)
-; X64-NEXT: movq %rcx, 8(%rdx)
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $136, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movdqu (%ecx), %xmm0
-; X86-NEXT: movdqu 16(%ecx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %eax
-; X86-NEXT: movl 8(%esp,%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esp,%eax), %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 16(%esp,%eax), %esi
-; X86-NEXT: movl 20(%esp,%eax), %edi
-; X86-NEXT: movl 24(%esp,%eax), %ebx
-; X86-NEXT: movl 28(%esp,%eax), %ebp
-; X86-NEXT: movl 32(%esp,%eax), %edx
-; X86-NEXT: movl 36(%esp,%eax), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, 28(%eax)
-; X86-NEXT: movl %edx, 24(%eax)
-; X86-NEXT: movl %ebp, 20(%eax)
-; X86-NEXT: movl %ebx, 16(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $136, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %r8b
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %r11, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $156, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edi
+; X86-SHLD-NEXT: andl $60, %edi
+; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx
+; X86-SHLD-NEXT: movl 20(%esp,%edi), %eax
+; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: movl %eax, %esi
+; X86-SHLD-NEXT: movl %edx, %eax
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: movl 32(%esp,%edi), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-SHLD-NEXT: movl 36(%esp,%edi), %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-SHLD-NEXT: movl 40(%esp,%edi), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl 44(%esp,%edi), %eax
+; X86-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-SHLD-NEXT: movl 48(%esp,%edi), %edi
+; X86-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SHLD-NEXT: movl %eax, 28(%edi)
+; X86-SHLD-NEXT: movl %edx, 24(%edi)
+; X86-SHLD-NEXT: movl %esi, 20(%edi)
+; X86-SHLD-NEXT: movl %ebp, 16(%edi)
+; X86-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 12(%edi)
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 8(%edi)
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 4(%edi)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-SHLD-NEXT: movl %ebx, (%edi)
+; X86-SHLD-NEXT: addl $156, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1633,9 +3432,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
-; X64-NO-BMI2-HAVE-SHLD: {{.*}}
+; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
+; X86: {{.*}}
; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-SHLD: {{.*}}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 4a47e7613dfa6..aeb7b233f853d 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -603,32 +603,92 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movb %dl, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -711,32 +771,92 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movw %dx, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -818,32 +938,92 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movl %edx, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -925,34 +1105,137 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqu (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $36, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SHLD-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl 8(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $36, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -967,64 +1250,128 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; no @load_16byte_chunk_of_16byte_alloca
define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movzbl -64(%rsp,%rax), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT: movb %al, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1038,64 +1385,141 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1108,64 +1532,141 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movl -64(%rsp,%rax), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1178,66 +1679,197 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movl %ecx, %eax
+; X64-SHLD-NEXT: shrb $6, %al
+; X64-SHLD-NEXT: movzbl %al, %eax
+; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $64, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $64, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1250,76 +1882,295 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-NEXT: movq -56(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, 8(%rdx)
-; X64-NEXT: movq %rcx, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $92, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %eax
+; X86-SHLD-NEXT: shrb $5, %al
+; X86-SHLD-NEXT: movzbl %al, %ebx
+; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $92, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1334,7 +2185,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; no @load_32byte_chunk_of_32byte_alloca
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
+; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86: {{.*}}
; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}