[llvm] 3e0a76b - [Codegen][LegalizeIntegerTypes] Improve shift through stack (#96151)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 23 02:45:53 PDT 2024
Author: futog
Date: 2024-09-23T11:45:43+02:00
New Revision: 3e0a76b1fd10d2f5f36d34a91b525c1d29685185
URL: https://github.com/llvm/llvm-project/commit/3e0a76b1fd10d2f5f36d34a91b525c1d29685185
DIFF: https://github.com/llvm/llvm-project/commit/3e0a76b1fd10d2f5f36d34a91b525c1d29685185.diff
LOG: [Codegen][LegalizeIntegerTypes] Improve shift through stack (#96151)
Minor improvement on cc39c3b17fb2598e20ca0854f9fe6d69169d85c7.
Use an aligned stack slot to store the shifted value.
Use the native register width as the shifting unit, so that the load
of the shift result is aligned.
If the shift amount is a multiple of the native register width, there is
no need to do a follow-up shift after the load. I added new tests for
these cases.
Co-authored-by: Gergely Futo <gergely.futo at hightec-rt.com>
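
For readers following along, here is a minimal C sketch of the
shift-through-stack technique this patch improves, for a logical right
shift of a 256-bit value on a hypothetical 64-bit target. The function
and variable names, the limb count, and the buffer layout are
illustrative assumptions, not the SelectionDAG implementation itself:

#include <stdint.h>
#include <string.h>

/* Logical right shift of a 4-limb (256-bit) little-endian value by
   amt bits, 0 <= amt <= 255. Mirrors the legalization: spill into a
   2x-wide aligned buffer, load at a register-width-aligned offset,
   then do at most one residual sub-unit shift. */
static void lshr256(const uint64_t src[4], unsigned amt, uint64_t dst[4]) {
  uint64_t buf[8] = {0};              /* zeros above; sign bits for ashr */
  memcpy(buf, src, 4 * sizeof(uint64_t));

  /* Step 1: shift by whole 64-bit units via an aligned offset load.
     This plays the role of BitOffset = (amt >> 6) << 6 and
     ByteOffset = BitOffset / 8 in the patch. */
  uint64_t tmp[4];
  memcpy(tmp, buf + amt / 64, sizeof tmp);

  /* Step 2: residual shift by ShAmtRem = amt & 63, skipped entirely
     when the amount is a known multiple of the unit (IsOneStepShift). */
  unsigned rem = amt % 64;
  if (rem == 0) {
    memcpy(dst, tmp, sizeof tmp);
    return;
  }
  for (int i = 0; i < 4; ++i) {
    uint64_t hi = (i + 1 < 4) ? tmp[i + 1] : 0;
    dst[i] = (tmp[i] >> rem) | (hi << (64 - rem));
  }
}

For example, with amt = 100 the load happens at limb offset 1 (byte
offset 8) and the residual shift is 36 bits; with amt = 128 (a multiple
of the 64-bit unit, as in the new *_dwordOff tests below) the residual
shift disappears and the whole operation is just an aligned offset load.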
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
llvm/test/CodeGen/Mips/llvm-ir/shl.ll
llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
llvm/test/CodeGen/PowerPC/pr59074.ll
llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
llvm/test/CodeGen/RISCV/shifts.ll
llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
llvm/test/CodeGen/X86/pr38539.ll
llvm/test/CodeGen/X86/scheduler-backtracking.ll
llvm/test/CodeGen/X86/shift-i128.ll
llvm/test/CodeGen/X86/shift-i256.ll
llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index c622b2abedeacf..ee9c95c8593766 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4608,14 +4608,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
SDValue ShAmt = N->getOperand(1);
EVT ShAmtVT = ShAmt.getValueType();
- // This legalization is optimal when the shift is by a multiple of byte width,
- // %x * 8 <-> %x << 3 so 3 low bits should be be known zero.
- bool ShiftByByteMultiple =
- DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
+ EVT LoadVT = VT;
+ do {
+ LoadVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadVT);
+ } while (!TLI.isTypeLegal(LoadVT));
+
+ const unsigned ShiftUnitInBits = LoadVT.getStoreSizeInBits();
+ assert(ShiftUnitInBits <= VT.getScalarSizeInBits());
+ assert(isPowerOf2_32(ShiftUnitInBits) &&
+ "Shifting unit is not a a power of two!");
+
+ const bool IsOneStepShift =
+ DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
+ Log2_32(ShiftUnitInBits);
// If we can't do it as one step, we'll have two uses of shift amount,
// and thus must freeze it.
- if (!ShiftByByteMultiple)
+ if (!IsOneStepShift)
ShAmt = DAG.getFreeze(ShAmt);
unsigned VTBitWidth = VT.getScalarSizeInBits();
@@ -4629,10 +4638,9 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
// Get a temporary stack slot 2x the width of our VT.
// FIXME: reuse stack slots?
- // FIXME: should we be more picky about alignment?
- Align StackSlotAlignment(1);
- SDValue StackPtr = DAG.CreateStackTemporary(
- TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
+ Align StackAlign = DAG.getReducedAlign(StackSlotVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(StackSlotVT.getStoreSize(), StackAlign);
EVT PtrTy = StackPtr.getValueType();
SDValue Ch = DAG.getEntryNode();
@@ -4652,15 +4660,22 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
Init = DAG.getNode(ISD::BUILD_PAIR, dl, StackSlotVT, AllZeros, Shiftee);
}
// And spill it into the stack slot.
- Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
+ Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackAlign);
// Now, compute the full-byte offset into stack slot from where we can load.
- // We have shift amount, which is in bits, but in multiples of byte.
- // So just divide by CHAR_BIT.
+  // We have the shift amount, which is in bits. The offset should point to
+  // an aligned address.
SDNodeFlags Flags;
- if (ShiftByByteMultiple)
- Flags.setExact(true);
- SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
+ Flags.setExact(IsOneStepShift);
+ SDValue SrlTmp = DAG.getNode(
+ ISD::SRL, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT), Flags);
+ SDValue BitOffset =
+ DAG.getNode(ISD::SHL, dl, ShAmtVT, SrlTmp,
+ DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+
+ Flags.setExact(true);
+ SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, BitOffset,
DAG.getConstant(3, dl, ShAmtVT), Flags);
// And clamp it, because OOB load is an immediate UB,
// while shift overflow would have *just* been poison.
@@ -4689,15 +4704,16 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
// And load it! While the load is not legal, legalizing it is obvious.
- SDValue Res = DAG.getLoad(
- VT, dl, Ch, AdjStackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
- // We've performed the shift by a CHAR_BIT * [_ShAmt / CHAR_BIT_]
-
- // If we may still have a less-than-CHAR_BIT to shift by, do so now.
- if (!ShiftByByteMultiple) {
- SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
- DAG.getConstant(7, dl, ShAmtVT));
+ SDValue Res =
+ DAG.getLoad(VT, dl, Ch, AdjStackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+ commonAlignment(StackAlign, LoadVT.getStoreSize()));
+
+  // If we may still have remaining bits to shift by, do so now.
+ if (!IsOneStepShift) {
+ SDValue ShAmtRem =
+ DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
}
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index e21015ad3db30c..b02788ab1b34c1 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -186,10 +186,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
; ALL-NEXT: mov x8, sp
-; ALL-NEXT: and x9, x10, #0x1f
+; ALL-NEXT: and x9, x10, #0x18
; ALL-NEXT: str q1, [sp]
; ALL-NEXT: add x8, x8, x9
+; ALL-NEXT: lsl x9, x10, #3
; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: ldp x11, x10, [x8, #16]
+; ALL-NEXT: mvn w13, w9
+; ALL-NEXT: ldp x8, x12, [x8]
+; ALL-NEXT: and x9, x9, #0x38
+; ALL-NEXT: lsl x14, x10, #1
+; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsr x11, x11, x9
+; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: lsr x10, x10, x9
+; ALL-NEXT: lsr x12, x12, x9
+; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x8, x8, x9
+; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsl x13, x15, x13
+; ALL-NEXT: orr x11, x14, x11
+; ALL-NEXT: orr x8, x9, x8
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x11, x10, [x2, #16]
+; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_32bytes_dwordOff:
+; ALL: // %bb.0:
+; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldp x9, x8, [x0, #16]
+; ALL-NEXT: movi v0.2d, #0000000000000000
+; ALL-NEXT: ldr x10, [x1]
+; ALL-NEXT: ldr q1, [x0]
+; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: ubfiz x8, x10, #3, #2
+; ALL-NEXT: mov x9, sp
+; ALL-NEXT: str q1, [sp]
+; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: add x8, x9, x8
; ALL-NEXT: ldp x10, x9, [x8, #16]
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
@@ -197,12 +241,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = lshr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: shl_32bytes:
; ALL: // %bb.0:
@@ -213,11 +258,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #48]
; ALL-NEXT: mov x8, sp
-; ALL-NEXT: and x9, x10, #0x1f
+; ALL-NEXT: and x9, x10, #0x18
; ALL-NEXT: add x8, x8, #32
; ALL-NEXT: stp q0, q0, [sp]
; ALL-NEXT: str q1, [sp, #32]
; ALL-NEXT: sub x8, x8, x9
+; ALL-NEXT: lsl x9, x10, #3
+; ALL-NEXT: ldp x10, x11, [x8]
+; ALL-NEXT: ldp x12, x8, [x8, #16]
+; ALL-NEXT: mvn w13, w9
+; ALL-NEXT: and x9, x9, #0x38
+; ALL-NEXT: lsr x14, x10, #1
+; ALL-NEXT: lsr x15, x11, #1
+; ALL-NEXT: lsl x11, x11, x9
+; ALL-NEXT: lsr x16, x12, #1
+; ALL-NEXT: lsl x10, x10, x9
+; ALL-NEXT: lsl x12, x12, x9
+; ALL-NEXT: lsr x14, x14, x13
+; ALL-NEXT: lsl x8, x8, x9
+; ALL-NEXT: lsr x9, x16, x13
+; ALL-NEXT: lsr x13, x15, x13
+; ALL-NEXT: orr x11, x11, x14
+; ALL-NEXT: orr x8, x8, x9
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x10, x11, [x2]
+; ALL-NEXT: stp x9, x8, [x2, #16]
+; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_32bytes_dwordOff:
+; ALL: // %bb.0:
+; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldp x9, x8, [x0, #16]
+; ALL-NEXT: movi v0.2d, #0000000000000000
+; ALL-NEXT: ldr x10, [x1]
+; ALL-NEXT: ldr q1, [x0]
+; ALL-NEXT: stp x9, x8, [sp, #48]
+; ALL-NEXT: mov x8, sp
+; ALL-NEXT: ubfiz x9, x10, #3, #2
+; ALL-NEXT: add x8, x8, #32
+; ALL-NEXT: stp q0, q1, [sp, #16]
+; ALL-NEXT: str q0, [sp]
+; ALL-NEXT: sub x8, x8, x9
; ALL-NEXT: ldp x9, x10, [x8, #16]
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
@@ -225,12 +315,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = shl i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: ashr_32bytes:
; ALL: // %bb.0:
@@ -238,14 +329,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q0, [x0]
-; ALL-NEXT: and x10, x10, #0x1f
+; ALL-NEXT: and x11, x10, #0x18
; ALL-NEXT: stp x9, x8, [sp, #16]
; ALL-NEXT: asr x8, x8, #63
; ALL-NEXT: mov x9, sp
; ALL-NEXT: str q0, [sp]
+; ALL-NEXT: add x9, x9, x11
+; ALL-NEXT: stp x8, x8, [sp, #48]
+; ALL-NEXT: stp x8, x8, [sp, #32]
+; ALL-NEXT: lsl x8, x10, #3
+; ALL-NEXT: ldp x11, x10, [x9, #16]
+; ALL-NEXT: ldp x9, x12, [x9]
+; ALL-NEXT: mvn w13, w8
+; ALL-NEXT: and x8, x8, #0x38
+; ALL-NEXT: lsl x14, x10, #1
+; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsr x11, x11, x8
+; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: asr x10, x10, x8
+; ALL-NEXT: lsr x12, x12, x8
+; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x8, x9, x8
+; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsl x13, x15, x13
+; ALL-NEXT: orr x11, x14, x11
+; ALL-NEXT: orr x8, x9, x8
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x11, x10, [x2, #16]
+; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_32bytes_dwordOff:
+; ALL: // %bb.0:
+; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldp x9, x8, [x0, #16]
+; ALL-NEXT: ldr x10, [x1]
+; ALL-NEXT: ldr q0, [x0]
+; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: asr x8, x8, #63
+; ALL-NEXT: ubfiz x9, x10, #3, #2
+; ALL-NEXT: mov x10, sp
+; ALL-NEXT: str q0, [sp]
; ALL-NEXT: stp x8, x8, [sp, #48]
; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: add x8, x9, x10
+; ALL-NEXT: add x8, x10, x9
; ALL-NEXT: ldp x10, x9, [x8, #16]
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
@@ -253,8 +389,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index a4da6db57ecae3..531e0fa740da78 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -160,30 +160,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
-; ALL-NEXT: ubfx x8, x10, #3, #5
+; ALL-NEXT: lsr x8, x10, #3
; ALL-NEXT: mov x9, sp
; ALL-NEXT: str q1, [sp]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x8, x8, #0x18
; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: eor x12, x12, #0x3f
; ALL-NEXT: add x8, x9, x8
-; ALL-NEXT: mvn w13, w10
-; ALL-NEXT: ldp x11, x9, [x8, #16]
-; ALL-NEXT: ldp x8, x12, [x8]
+; ALL-NEXT: ldp x13, x11, [x8]
+; ALL-NEXT: ldr x9, [x8, #24]
+; ALL-NEXT: ldr x8, [x8, #16]
; ALL-NEXT: lsl x14, x9, #1
+; ALL-NEXT: lsr x9, x9, x10
; ALL-NEXT: lsl x15, x11, #1
; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsl x16, x12, #1
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsr x12, x12, x10
-; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x13, x13, x10
+; ALL-NEXT: lsl x14, x14, x12
+; ALL-NEXT: lsl x12, x15, x12
+; ALL-NEXT: lsl x15, x8, #1
; ALL-NEXT: lsr x8, x8, x10
-; ALL-NEXT: lsl x10, x16, x13
-; ALL-NEXT: lsl x13, x15, x13
-; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: stp x11, x9, [x2, #16]
-; ALL-NEXT: orr x8, x10, x8
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsl x10, x15, x10
+; ALL-NEXT: orr x8, x14, x8
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x9, x8, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -201,31 +204,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #48]
-; ALL-NEXT: mov x8, sp
-; ALL-NEXT: ubfx x9, x10, #3, #5
-; ALL-NEXT: add x8, x8, #32
+; ALL-NEXT: lsr x8, x10, #3
+; ALL-NEXT: mov x9, sp
+; ALL-NEXT: add x9, x9, #32
; ALL-NEXT: stp q0, q1, [sp, #16]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x8, x8, #0x18
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: sub x8, x8, x9
-; ALL-NEXT: mvn w13, w10
-; ALL-NEXT: ldp x9, x11, [x8]
-; ALL-NEXT: ldp x12, x8, [x8, #16]
-; ALL-NEXT: lsr x14, x9, #1
-; ALL-NEXT: lsr x15, x11, #1
-; ALL-NEXT: lsl x11, x11, x10
-; ALL-NEXT: lsr x16, x12, #1
+; ALL-NEXT: eor x12, x12, #0x3f
+; ALL-NEXT: sub x8, x9, x8
+; ALL-NEXT: ldp x11, x13, [x8, #16]
+; ALL-NEXT: ldr x9, [x8]
+; ALL-NEXT: ldr x8, [x8, #8]
+; ALL-NEXT: lsr x15, x9, #1
; ALL-NEXT: lsl x9, x9, x10
-; ALL-NEXT: lsl x12, x12, x10
-; ALL-NEXT: lsr x14, x14, x13
+; ALL-NEXT: lsr x14, x11, #1
+; ALL-NEXT: lsl x11, x11, x10
+; ALL-NEXT: lsl x13, x13, x10
+; ALL-NEXT: lsr x14, x14, x12
+; ALL-NEXT: lsr x12, x15, x12
+; ALL-NEXT: lsr x15, x8, #1
; ALL-NEXT: lsl x8, x8, x10
-; ALL-NEXT: lsr x10, x16, x13
-; ALL-NEXT: lsr x13, x15, x13
-; ALL-NEXT: orr x11, x11, x14
-; ALL-NEXT: stp x9, x11, [x2]
-; ALL-NEXT: orr x8, x8, x10
-; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x9, x8, [x2, #16]
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsr x10, x15, x10
+; ALL-NEXT: orr x8, x8, x12
+; ALL-NEXT: stp x9, x8, [x2]
+; ALL-NEXT: orr x9, x13, x14
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -243,31 +249,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q0, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: lsr x9, x10, #3
; ALL-NEXT: asr x8, x8, #63
-; ALL-NEXT: ubfx x9, x10, #3, #5
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x9, x9, #0x18
; ALL-NEXT: stp x8, x8, [sp, #48]
-; ALL-NEXT: add x9, x11, x9
-; ALL-NEXT: mvn w13, w10
+; ALL-NEXT: eor x12, x12, #0x3f
; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: ldp x11, x8, [x9, #16]
-; ALL-NEXT: ldp x9, x12, [x9]
-; ALL-NEXT: lsl x14, x8, #1
+; ALL-NEXT: add x8, x11, x9
+; ALL-NEXT: ldp x13, x11, [x8]
+; ALL-NEXT: ldr x9, [x8, #24]
+; ALL-NEXT: ldr x8, [x8, #16]
+; ALL-NEXT: lsl x14, x9, #1
+; ALL-NEXT: asr x9, x9, x10
; ALL-NEXT: lsl x15, x11, #1
; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsl x16, x12, #1
-; ALL-NEXT: asr x8, x8, x10
-; ALL-NEXT: lsr x12, x12, x10
-; ALL-NEXT: lsl x14, x14, x13
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsl x10, x16, x13
-; ALL-NEXT: lsl x13, x15, x13
-; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: stp x11, x8, [x2, #16]
-; ALL-NEXT: orr x8, x10, x9
+; ALL-NEXT: lsr x13, x13, x10
+; ALL-NEXT: lsl x14, x14, x12
+; ALL-NEXT: lsl x12, x15, x12
+; ALL-NEXT: lsl x15, x8, #1
+; ALL-NEXT: lsr x8, x8, x10
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsl x10, x15, x10
+; ALL-NEXT: orr x8, x14, x8
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x9, x8, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 450fe968d4917c..2b8129acb91fce 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -382,53 +382,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS: # %bb.0: # %entry
; MIPS-NEXT: addiu $sp, $sp, -32
; MIPS-NEXT: .cfi_def_cfa_offset 32
-; MIPS-NEXT: swl $7, 28($sp)
-; MIPS-NEXT: swl $6, 24($sp)
; MIPS-NEXT: sra $1, $4, 31
-; MIPS-NEXT: swl $5, 20($sp)
-; MIPS-NEXT: swl $4, 16($sp)
-; MIPS-NEXT: swl $1, 12($sp)
-; MIPS-NEXT: swl $1, 8($sp)
-; MIPS-NEXT: swl $1, 4($sp)
-; MIPS-NEXT: swl $1, 0($sp)
-; MIPS-NEXT: addiu $2, $sp, 0
-; MIPS-NEXT: swr $7, 31($sp)
-; MIPS-NEXT: swr $6, 27($sp)
-; MIPS-NEXT: swr $5, 23($sp)
-; MIPS-NEXT: swr $4, 19($sp)
-; MIPS-NEXT: swr $1, 15($sp)
-; MIPS-NEXT: swr $1, 11($sp)
-; MIPS-NEXT: swr $1, 7($sp)
-; MIPS-NEXT: swr $1, 3($sp)
-; MIPS-NEXT: addiu $1, $2, 16
+; MIPS-NEXT: sw $7, 28($sp)
+; MIPS-NEXT: sw $6, 24($sp)
+; MIPS-NEXT: sw $5, 20($sp)
+; MIPS-NEXT: sw $4, 16($sp)
+; MIPS-NEXT: sw $1, 12($sp)
+; MIPS-NEXT: sw $1, 8($sp)
+; MIPS-NEXT: sw $1, 4($sp)
+; MIPS-NEXT: sw $1, 0($sp)
+; MIPS-NEXT: addiu $1, $sp, 0
+; MIPS-NEXT: addiu $1, $1, 16
; MIPS-NEXT: lw $2, 60($sp)
; MIPS-NEXT: srl $3, $2, 3
-; MIPS-NEXT: andi $3, $3, 15
+; MIPS-NEXT: andi $3, $3, 12
; MIPS-NEXT: subu $1, $1, $3
-; MIPS-NEXT: lwl $3, 4($1)
-; MIPS-NEXT: lwr $3, 7($1)
-; MIPS-NEXT: sll $4, $3, 1
-; MIPS-NEXT: lwl $5, 8($1)
-; MIPS-NEXT: lwr $5, 11($1)
-; MIPS-NEXT: andi $2, $2, 7
-; MIPS-NEXT: not $6, $2
-; MIPS-NEXT: srlv $7, $5, $2
-; MIPS-NEXT: sllv $4, $4, $6
+; MIPS-NEXT: lw $3, 4($1)
+; MIPS-NEXT: lw $5, 8($1)
+; MIPS-NEXT: srlv $4, $5, $2
+; MIPS-NEXT: sll $6, $3, 1
+; MIPS-NEXT: andi $7, $2, 31
+; MIPS-NEXT: xori $7, $7, 31
+; MIPS-NEXT: sllv $6, $6, $7
; MIPS-NEXT: srlv $3, $3, $2
-; MIPS-NEXT: lwl $6, 0($1)
-; MIPS-NEXT: lwr $6, 3($1)
-; MIPS-NEXT: sll $8, $6, 1
-; MIPS-NEXT: xori $9, $2, 31
-; MIPS-NEXT: sllv $8, $8, $9
-; MIPS-NEXT: or $3, $3, $8
-; MIPS-NEXT: or $4, $7, $4
-; MIPS-NEXT: lwl $7, 12($1)
-; MIPS-NEXT: lwr $7, 15($1)
-; MIPS-NEXT: srlv $1, $7, $2
+; MIPS-NEXT: lw $8, 0($1)
+; MIPS-NEXT: sll $9, $8, 1
+; MIPS-NEXT: sllv $9, $9, $7
+; MIPS-NEXT: or $3, $3, $9
+; MIPS-NEXT: or $4, $4, $6
+; MIPS-NEXT: lw $1, 12($1)
+; MIPS-NEXT: srlv $1, $1, $2
; MIPS-NEXT: sll $5, $5, 1
-; MIPS-NEXT: sllv $5, $5, $9
+; MIPS-NEXT: sllv $5, $5, $7
; MIPS-NEXT: or $5, $1, $5
-; MIPS-NEXT: srav $2, $6, $2
+; MIPS-NEXT: srav $2, $8, $2
; MIPS-NEXT: jr $ra
; MIPS-NEXT: addiu $sp, $sp, 32
;
@@ -436,53 +423,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $7, 28($sp)
-; MIPS32-NEXT: swl $6, 24($sp)
; MIPS32-NEXT: sra $1, $4, 31
-; MIPS32-NEXT: swl $5, 20($sp)
-; MIPS32-NEXT: swl $4, 16($sp)
-; MIPS32-NEXT: swl $1, 12($sp)
-; MIPS32-NEXT: swl $1, 8($sp)
-; MIPS32-NEXT: swl $1, 4($sp)
-; MIPS32-NEXT: swl $1, 0($sp)
-; MIPS32-NEXT: addiu $2, $sp, 0
-; MIPS32-NEXT: swr $7, 31($sp)
-; MIPS32-NEXT: swr $6, 27($sp)
-; MIPS32-NEXT: swr $5, 23($sp)
-; MIPS32-NEXT: swr $4, 19($sp)
-; MIPS32-NEXT: swr $1, 15($sp)
-; MIPS32-NEXT: swr $1, 11($sp)
-; MIPS32-NEXT: swr $1, 7($sp)
-; MIPS32-NEXT: swr $1, 3($sp)
-; MIPS32-NEXT: addiu $1, $2, 16
+; MIPS32-NEXT: sw $7, 28($sp)
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $5, 20($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
+; MIPS32-NEXT: sw $1, 12($sp)
+; MIPS32-NEXT: sw $1, 8($sp)
+; MIPS32-NEXT: sw $1, 4($sp)
+; MIPS32-NEXT: sw $1, 0($sp)
+; MIPS32-NEXT: addiu $1, $sp, 0
+; MIPS32-NEXT: addiu $1, $1, 16
; MIPS32-NEXT: lw $2, 60($sp)
; MIPS32-NEXT: srl $3, $2, 3
-; MIPS32-NEXT: andi $3, $3, 15
+; MIPS32-NEXT: andi $3, $3, 12
; MIPS32-NEXT: subu $1, $1, $3
-; MIPS32-NEXT: lwl $3, 4($1)
-; MIPS32-NEXT: lwr $3, 7($1)
-; MIPS32-NEXT: sll $4, $3, 1
-; MIPS32-NEXT: lwl $5, 8($1)
-; MIPS32-NEXT: lwr $5, 11($1)
-; MIPS32-NEXT: andi $2, $2, 7
-; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: srlv $7, $5, $2
-; MIPS32-NEXT: sllv $4, $4, $6
+; MIPS32-NEXT: lw $3, 4($1)
+; MIPS32-NEXT: lw $5, 8($1)
+; MIPS32-NEXT: srlv $4, $5, $2
+; MIPS32-NEXT: sll $6, $3, 1
+; MIPS32-NEXT: andi $7, $2, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: sllv $6, $6, $7
; MIPS32-NEXT: srlv $3, $3, $2
-; MIPS32-NEXT: lwl $6, 0($1)
-; MIPS32-NEXT: lwr $6, 3($1)
-; MIPS32-NEXT: sll $8, $6, 1
-; MIPS32-NEXT: xori $9, $2, 31
-; MIPS32-NEXT: sllv $8, $8, $9
-; MIPS32-NEXT: or $3, $3, $8
-; MIPS32-NEXT: or $4, $7, $4
-; MIPS32-NEXT: lwl $7, 12($1)
-; MIPS32-NEXT: lwr $7, 15($1)
-; MIPS32-NEXT: srlv $1, $7, $2
+; MIPS32-NEXT: lw $8, 0($1)
+; MIPS32-NEXT: sll $9, $8, 1
+; MIPS32-NEXT: sllv $9, $9, $7
+; MIPS32-NEXT: or $3, $3, $9
+; MIPS32-NEXT: or $4, $4, $6
+; MIPS32-NEXT: lw $1, 12($1)
+; MIPS32-NEXT: srlv $1, $1, $2
; MIPS32-NEXT: sll $5, $5, 1
-; MIPS32-NEXT: sllv $5, $5, $9
+; MIPS32-NEXT: sllv $5, $5, $7
; MIPS32-NEXT: or $5, $1, $5
-; MIPS32-NEXT: srav $2, $6, $2
+; MIPS32-NEXT: srav $2, $8, $2
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: addiu $sp, $sp, 32
;
@@ -490,52 +464,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R2: # %bb.0: # %entry
; 32R2-NEXT: addiu $sp, $sp, -32
; 32R2-NEXT: .cfi_def_cfa_offset 32
-; 32R2-NEXT: swl $7, 28($sp)
-; 32R2-NEXT: swl $6, 24($sp)
-; 32R2-NEXT: swl $5, 20($sp)
; 32R2-NEXT: sra $1, $4, 31
-; 32R2-NEXT: swl $4, 16($sp)
-; 32R2-NEXT: swl $1, 12($sp)
-; 32R2-NEXT: swl $1, 8($sp)
-; 32R2-NEXT: swl $1, 4($sp)
-; 32R2-NEXT: swl $1, 0($sp)
-; 32R2-NEXT: swr $7, 31($sp)
-; 32R2-NEXT: swr $6, 27($sp)
-; 32R2-NEXT: swr $5, 23($sp)
-; 32R2-NEXT: swr $4, 19($sp)
-; 32R2-NEXT: swr $1, 15($sp)
-; 32R2-NEXT: swr $1, 11($sp)
-; 32R2-NEXT: swr $1, 7($sp)
-; 32R2-NEXT: swr $1, 3($sp)
+; 32R2-NEXT: sw $7, 28($sp)
+; 32R2-NEXT: sw $6, 24($sp)
+; 32R2-NEXT: sw $5, 20($sp)
+; 32R2-NEXT: sw $4, 16($sp)
+; 32R2-NEXT: sw $1, 12($sp)
+; 32R2-NEXT: sw $1, 8($sp)
+; 32R2-NEXT: sw $1, 4($sp)
+; 32R2-NEXT: sw $1, 0($sp)
; 32R2-NEXT: addiu $1, $sp, 0
; 32R2-NEXT: addiu $1, $1, 16
; 32R2-NEXT: lw $2, 60($sp)
-; 32R2-NEXT: ext $3, $2, 3, 4
+; 32R2-NEXT: srl $3, $2, 3
+; 32R2-NEXT: andi $3, $3, 12
; 32R2-NEXT: subu $1, $1, $3
-; 32R2-NEXT: lwl $3, 4($1)
-; 32R2-NEXT: lwr $3, 7($1)
-; 32R2-NEXT: sll $4, $3, 1
-; 32R2-NEXT: lwl $5, 8($1)
-; 32R2-NEXT: lwr $5, 11($1)
-; 32R2-NEXT: andi $2, $2, 7
-; 32R2-NEXT: not $6, $2
-; 32R2-NEXT: srlv $7, $5, $2
-; 32R2-NEXT: sllv $4, $4, $6
+; 32R2-NEXT: lw $3, 4($1)
+; 32R2-NEXT: lw $5, 8($1)
+; 32R2-NEXT: srlv $4, $5, $2
+; 32R2-NEXT: sll $6, $3, 1
+; 32R2-NEXT: andi $7, $2, 31
+; 32R2-NEXT: xori $7, $7, 31
+; 32R2-NEXT: sllv $6, $6, $7
; 32R2-NEXT: srlv $3, $3, $2
-; 32R2-NEXT: lwl $6, 0($1)
-; 32R2-NEXT: lwr $6, 3($1)
-; 32R2-NEXT: sll $8, $6, 1
-; 32R2-NEXT: xori $9, $2, 31
-; 32R2-NEXT: sllv $8, $8, $9
-; 32R2-NEXT: or $3, $3, $8
-; 32R2-NEXT: or $4, $7, $4
-; 32R2-NEXT: lwl $7, 12($1)
-; 32R2-NEXT: lwr $7, 15($1)
-; 32R2-NEXT: srlv $1, $7, $2
+; 32R2-NEXT: lw $8, 0($1)
+; 32R2-NEXT: sll $9, $8, 1
+; 32R2-NEXT: sllv $9, $9, $7
+; 32R2-NEXT: or $3, $3, $9
+; 32R2-NEXT: or $4, $4, $6
+; 32R2-NEXT: lw $1, 12($1)
+; 32R2-NEXT: srlv $1, $1, $2
; 32R2-NEXT: sll $5, $5, 1
-; 32R2-NEXT: sllv $5, $5, $9
+; 32R2-NEXT: sllv $5, $5, $7
; 32R2-NEXT: or $5, $1, $5
-; 32R2-NEXT: srav $2, $6, $2
+; 32R2-NEXT: srav $2, $8, $2
; 32R2-NEXT: jr $ra
; 32R2-NEXT: addiu $sp, $sp, 32
;
@@ -555,28 +517,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R6-NEXT: addiu $1, $sp, 0
; 32R6-NEXT: addiu $1, $1, 16
; 32R6-NEXT: lw $2, 60($sp)
-; 32R6-NEXT: ext $3, $2, 3, 4
+; 32R6-NEXT: srl $3, $2, 3
+; 32R6-NEXT: andi $3, $3, 12
; 32R6-NEXT: subu $1, $1, $3
; 32R6-NEXT: lw $3, 4($1)
-; 32R6-NEXT: sll $4, $3, 1
; 32R6-NEXT: lw $5, 8($1)
-; 32R6-NEXT: andi $2, $2, 7
-; 32R6-NEXT: not $6, $2
-; 32R6-NEXT: srlv $7, $5, $2
-; 32R6-NEXT: sllv $4, $4, $6
+; 32R6-NEXT: srlv $4, $5, $2
+; 32R6-NEXT: sll $6, $3, 1
+; 32R6-NEXT: andi $7, $2, 31
+; 32R6-NEXT: xori $7, $7, 31
+; 32R6-NEXT: sllv $6, $6, $7
; 32R6-NEXT: srlv $3, $3, $2
-; 32R6-NEXT: lw $6, 0($1)
-; 32R6-NEXT: sll $8, $6, 1
-; 32R6-NEXT: xori $9, $2, 31
-; 32R6-NEXT: sllv $8, $8, $9
-; 32R6-NEXT: or $3, $3, $8
-; 32R6-NEXT: or $4, $7, $4
+; 32R6-NEXT: lw $8, 0($1)
+; 32R6-NEXT: sll $9, $8, 1
+; 32R6-NEXT: sllv $9, $9, $7
+; 32R6-NEXT: or $3, $3, $9
+; 32R6-NEXT: or $4, $4, $6
; 32R6-NEXT: lw $1, 12($1)
; 32R6-NEXT: srlv $1, $1, $2
; 32R6-NEXT: sll $5, $5, 1
-; 32R6-NEXT: sllv $5, $5, $9
+; 32R6-NEXT: sllv $5, $5, $7
; 32R6-NEXT: or $5, $1, $5
-; 32R6-NEXT: srav $2, $6, $2
+; 32R6-NEXT: srav $2, $8, $2
; 32R6-NEXT: jr $ra
; 32R6-NEXT: addiu $sp, $sp, 32
;
@@ -656,53 +618,37 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: swl $7, 28($sp)
-; MMR3-NEXT: swl $6, 24($sp)
-; MMR3-NEXT: swl $5, 20($sp)
; MMR3-NEXT: sra $1, $4, 31
-; MMR3-NEXT: swl $4, 16($sp)
-; MMR3-NEXT: swl $1, 12($sp)
-; MMR3-NEXT: swl $1, 8($sp)
-; MMR3-NEXT: swl $1, 4($sp)
-; MMR3-NEXT: swl $1, 0($sp)
-; MMR3-NEXT: swr $7, 31($sp)
-; MMR3-NEXT: swr $6, 27($sp)
-; MMR3-NEXT: swr $5, 23($sp)
-; MMR3-NEXT: swr $4, 19($sp)
-; MMR3-NEXT: swr $1, 15($sp)
-; MMR3-NEXT: swr $1, 11($sp)
-; MMR3-NEXT: swr $1, 7($sp)
-; MMR3-NEXT: swr $1, 3($sp)
+; MMR3-NEXT: swp $6, 24($sp)
+; MMR3-NEXT: swp $4, 16($sp)
+; MMR3-NEXT: sw $1, 12($sp)
+; MMR3-NEXT: sw $1, 8($sp)
+; MMR3-NEXT: sw $1, 4($sp)
+; MMR3-NEXT: sw $1, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
-; MMR3-NEXT: ext $4, $3, 3, 4
-; MMR3-NEXT: subu16 $2, $2, $4
-; MMR3-NEXT: lwl $7, 4($2)
-; MMR3-NEXT: lwr $7, 7($2)
-; MMR3-NEXT: sll16 $4, $7, 1
-; MMR3-NEXT: lwl $5, 8($2)
-; MMR3-NEXT: lwr $5, 11($2)
-; MMR3-NEXT: andi16 $6, $3, 7
-; MMR3-NEXT: not16 $3, $6
-; MMR3-NEXT: andi16 $3, $3, 31
-; MMR3-NEXT: srlv $16, $5, $6
-; MMR3-NEXT: sllv $4, $4, $3
-; MMR3-NEXT: srlv $17, $7, $6
-; MMR3-NEXT: lwl $7, 0($2)
-; MMR3-NEXT: lwr $7, 3($2)
-; MMR3-NEXT: sll16 $3, $7, 1
-; MMR3-NEXT: xori $1, $6, 31
+; MMR3-NEXT: srl16 $4, $3, 3
+; MMR3-NEXT: andi $4, $4, 12
+; MMR3-NEXT: subu16 $5, $2, $4
+; MMR3-NEXT: lwp $6, 4($5)
+; MMR3-NEXT: andi16 $2, $3, 31
+; MMR3-NEXT: srlv $16, $7, $2
+; MMR3-NEXT: sll16 $3, $6, 1
+; MMR3-NEXT: xori $1, $2, 31
+; MMR3-NEXT: sllv $4, $3, $1
+; MMR3-NEXT: srlv $6, $6, $2
+; MMR3-NEXT: lw16 $17, 0($5)
+; MMR3-NEXT: sll16 $3, $17, 1
; MMR3-NEXT: sllv $3, $3, $1
-; MMR3-NEXT: or16 $3, $17
+; MMR3-NEXT: or16 $3, $6
; MMR3-NEXT: or16 $4, $16
-; MMR3-NEXT: lwl $8, 12($2)
-; MMR3-NEXT: lwr $8, 15($2)
-; MMR3-NEXT: srlv $2, $8, $6
-; MMR3-NEXT: sll16 $5, $5, 1
+; MMR3-NEXT: lw16 $5, 12($5)
+; MMR3-NEXT: srlv $6, $5, $2
+; MMR3-NEXT: sll16 $5, $7, 1
; MMR3-NEXT: sllv $5, $5, $1
-; MMR3-NEXT: or16 $5, $2
-; MMR3-NEXT: srav $2, $7, $6
+; MMR3-NEXT: or16 $5, $6
+; MMR3-NEXT: srav $2, $17, $2
; MMR3-NEXT: lwp $16, 32($sp)
; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
@@ -714,40 +660,39 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 16, -4
; MMR6-NEXT: sra $1, $4, 31
-; MMR6-NEXT: sw $7, 32($sp)
-; MMR6-NEXT: sw $6, 28($sp)
-; MMR6-NEXT: sw $5, 24($sp)
-; MMR6-NEXT: sw $4, 20($sp)
-; MMR6-NEXT: sw $1, 16($sp)
+; MMR6-NEXT: sw $7, 28($sp)
+; MMR6-NEXT: sw $6, 24($sp)
+; MMR6-NEXT: sw $5, 20($sp)
+; MMR6-NEXT: sw $4, 16($sp)
; MMR6-NEXT: sw $1, 12($sp)
; MMR6-NEXT: sw $1, 8($sp)
; MMR6-NEXT: sw $1, 4($sp)
-; MMR6-NEXT: addiu $2, $sp, 4
+; MMR6-NEXT: sw $1, 0($sp)
+; MMR6-NEXT: addiu $2, $sp, 0
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
-; MMR6-NEXT: ext $4, $3, 3, 4
-; MMR6-NEXT: subu16 $5, $2, $4
-; MMR6-NEXT: lw16 $4, 4($5)
-; MMR6-NEXT: sll16 $6, $4, 1
-; MMR6-NEXT: lw16 $7, 8($5)
-; MMR6-NEXT: andi16 $2, $3, 7
-; MMR6-NEXT: not16 $3, $2
-; MMR6-NEXT: andi16 $3, $3, 31
-; MMR6-NEXT: srlv $1, $7, $2
-; MMR6-NEXT: sllv $6, $6, $3
-; MMR6-NEXT: srlv $3, $4, $2
-; MMR6-NEXT: lw16 $16, 0($5)
+; MMR6-NEXT: srl16 $4, $3, 3
+; MMR6-NEXT: andi $4, $4, 12
+; MMR6-NEXT: subu16 $2, $2, $4
+; MMR6-NEXT: lw16 $4, 4($2)
+; MMR6-NEXT: lw16 $5, 8($2)
+; MMR6-NEXT: andi16 $6, $3, 31
+; MMR6-NEXT: srlv $1, $5, $6
+; MMR6-NEXT: sll16 $3, $4, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: sllv $8, $3, $7
+; MMR6-NEXT: srlv $3, $4, $6
+; MMR6-NEXT: lw16 $16, 0($2)
; MMR6-NEXT: sll16 $4, $16, 1
-; MMR6-NEXT: xori $8, $2, 31
-; MMR6-NEXT: sllv $4, $4, $8
+; MMR6-NEXT: sllv $4, $4, $7
; MMR6-NEXT: or $3, $3, $4
-; MMR6-NEXT: or $4, $1, $6
-; MMR6-NEXT: lw16 $5, 12($5)
-; MMR6-NEXT: srlv $1, $5, $2
-; MMR6-NEXT: sll16 $5, $7, 1
-; MMR6-NEXT: sllv $5, $5, $8
-; MMR6-NEXT: or $5, $1, $5
-; MMR6-NEXT: srav $2, $16, $2
+; MMR6-NEXT: or $4, $1, $8
+; MMR6-NEXT: lw16 $2, 12($2)
+; MMR6-NEXT: srlv $1, $2, $6
+; MMR6-NEXT: sll16 $2, $5, 1
+; MMR6-NEXT: sllv $2, $2, $7
+; MMR6-NEXT: or $5, $1, $2
+; MMR6-NEXT: srav $2, $16, $6
; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index 03cf104e3120c4..69b842c73db1b4 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -398,52 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS2: # %bb.0: # %entry
; MIPS2-NEXT: addiu $sp, $sp, -32
; MIPS2-NEXT: .cfi_def_cfa_offset 32
-; MIPS2-NEXT: swl $7, 28($sp)
-; MIPS2-NEXT: swl $6, 24($sp)
-; MIPS2-NEXT: swl $5, 20($sp)
-; MIPS2-NEXT: swl $4, 16($sp)
-; MIPS2-NEXT: swl $zero, 12($sp)
-; MIPS2-NEXT: swl $zero, 8($sp)
-; MIPS2-NEXT: swl $zero, 4($sp)
-; MIPS2-NEXT: swl $zero, 0($sp)
; MIPS2-NEXT: addiu $1, $sp, 0
-; MIPS2-NEXT: swr $7, 31($sp)
-; MIPS2-NEXT: swr $6, 27($sp)
-; MIPS2-NEXT: swr $5, 23($sp)
-; MIPS2-NEXT: swr $4, 19($sp)
-; MIPS2-NEXT: swr $zero, 15($sp)
-; MIPS2-NEXT: swr $zero, 11($sp)
-; MIPS2-NEXT: swr $zero, 7($sp)
-; MIPS2-NEXT: swr $zero, 3($sp)
+; MIPS2-NEXT: sw $7, 28($sp)
+; MIPS2-NEXT: sw $6, 24($sp)
+; MIPS2-NEXT: sw $5, 20($sp)
+; MIPS2-NEXT: sw $4, 16($sp)
; MIPS2-NEXT: addiu $1, $1, 16
; MIPS2-NEXT: lw $2, 60($sp)
; MIPS2-NEXT: srl $3, $2, 3
-; MIPS2-NEXT: andi $3, $3, 15
+; MIPS2-NEXT: andi $3, $3, 12
; MIPS2-NEXT: subu $1, $1, $3
-; MIPS2-NEXT: lwl $3, 4($1)
-; MIPS2-NEXT: lwr $3, 7($1)
-; MIPS2-NEXT: sll $4, $3, 1
-; MIPS2-NEXT: lwl $5, 8($1)
-; MIPS2-NEXT: lwr $5, 11($1)
-; MIPS2-NEXT: andi $2, $2, 7
-; MIPS2-NEXT: not $6, $2
-; MIPS2-NEXT: srlv $7, $5, $2
-; MIPS2-NEXT: sllv $4, $4, $6
+; MIPS2-NEXT: sw $zero, 12($sp)
+; MIPS2-NEXT: sw $zero, 8($sp)
+; MIPS2-NEXT: sw $zero, 4($sp)
+; MIPS2-NEXT: sw $zero, 0($sp)
+; MIPS2-NEXT: lw $3, 4($1)
+; MIPS2-NEXT: lw $5, 8($1)
+; MIPS2-NEXT: srlv $4, $5, $2
+; MIPS2-NEXT: sll $6, $3, 1
+; MIPS2-NEXT: andi $7, $2, 31
+; MIPS2-NEXT: xori $7, $7, 31
+; MIPS2-NEXT: sllv $6, $6, $7
; MIPS2-NEXT: srlv $3, $3, $2
-; MIPS2-NEXT: lwl $6, 0($1)
-; MIPS2-NEXT: lwr $6, 3($1)
-; MIPS2-NEXT: sll $8, $6, 1
-; MIPS2-NEXT: xori $9, $2, 31
-; MIPS2-NEXT: sllv $8, $8, $9
-; MIPS2-NEXT: or $3, $3, $8
-; MIPS2-NEXT: or $4, $7, $4
-; MIPS2-NEXT: lwl $7, 12($1)
-; MIPS2-NEXT: lwr $7, 15($1)
-; MIPS2-NEXT: srlv $1, $7, $2
+; MIPS2-NEXT: lw $8, 0($1)
+; MIPS2-NEXT: sll $9, $8, 1
+; MIPS2-NEXT: sllv $9, $9, $7
+; MIPS2-NEXT: or $3, $3, $9
+; MIPS2-NEXT: or $4, $4, $6
+; MIPS2-NEXT: lw $1, 12($1)
+; MIPS2-NEXT: srlv $1, $1, $2
; MIPS2-NEXT: sll $5, $5, 1
-; MIPS2-NEXT: sllv $5, $5, $9
+; MIPS2-NEXT: sllv $5, $5, $7
; MIPS2-NEXT: or $5, $1, $5
-; MIPS2-NEXT: srlv $2, $6, $2
+; MIPS2-NEXT: srlv $2, $8, $2
; MIPS2-NEXT: jr $ra
; MIPS2-NEXT: addiu $sp, $sp, 32
;
@@ -451,52 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $7, 28($sp)
-; MIPS32-NEXT: swl $6, 24($sp)
-; MIPS32-NEXT: swl $5, 20($sp)
-; MIPS32-NEXT: swl $4, 16($sp)
-; MIPS32-NEXT: swl $zero, 12($sp)
-; MIPS32-NEXT: swl $zero, 8($sp)
-; MIPS32-NEXT: swl $zero, 4($sp)
-; MIPS32-NEXT: swl $zero, 0($sp)
; MIPS32-NEXT: addiu $1, $sp, 0
-; MIPS32-NEXT: swr $7, 31($sp)
-; MIPS32-NEXT: swr $6, 27($sp)
-; MIPS32-NEXT: swr $5, 23($sp)
-; MIPS32-NEXT: swr $4, 19($sp)
-; MIPS32-NEXT: swr $zero, 15($sp)
-; MIPS32-NEXT: swr $zero, 11($sp)
-; MIPS32-NEXT: swr $zero, 7($sp)
-; MIPS32-NEXT: swr $zero, 3($sp)
+; MIPS32-NEXT: sw $7, 28($sp)
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $5, 20($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
; MIPS32-NEXT: addiu $1, $1, 16
; MIPS32-NEXT: lw $2, 60($sp)
; MIPS32-NEXT: srl $3, $2, 3
-; MIPS32-NEXT: andi $3, $3, 15
+; MIPS32-NEXT: andi $3, $3, 12
; MIPS32-NEXT: subu $1, $1, $3
-; MIPS32-NEXT: lwl $3, 4($1)
-; MIPS32-NEXT: lwr $3, 7($1)
-; MIPS32-NEXT: sll $4, $3, 1
-; MIPS32-NEXT: lwl $5, 8($1)
-; MIPS32-NEXT: lwr $5, 11($1)
-; MIPS32-NEXT: andi $2, $2, 7
-; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: srlv $7, $5, $2
-; MIPS32-NEXT: sllv $4, $4, $6
+; MIPS32-NEXT: sw $zero, 12($sp)
+; MIPS32-NEXT: sw $zero, 8($sp)
+; MIPS32-NEXT: sw $zero, 4($sp)
+; MIPS32-NEXT: sw $zero, 0($sp)
+; MIPS32-NEXT: lw $3, 4($1)
+; MIPS32-NEXT: lw $5, 8($1)
+; MIPS32-NEXT: srlv $4, $5, $2
+; MIPS32-NEXT: sll $6, $3, 1
+; MIPS32-NEXT: andi $7, $2, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: sllv $6, $6, $7
; MIPS32-NEXT: srlv $3, $3, $2
-; MIPS32-NEXT: lwl $6, 0($1)
-; MIPS32-NEXT: lwr $6, 3($1)
-; MIPS32-NEXT: sll $8, $6, 1
-; MIPS32-NEXT: xori $9, $2, 31
-; MIPS32-NEXT: sllv $8, $8, $9
-; MIPS32-NEXT: or $3, $3, $8
-; MIPS32-NEXT: or $4, $7, $4
-; MIPS32-NEXT: lwl $7, 12($1)
-; MIPS32-NEXT: lwr $7, 15($1)
-; MIPS32-NEXT: srlv $1, $7, $2
+; MIPS32-NEXT: lw $8, 0($1)
+; MIPS32-NEXT: sll $9, $8, 1
+; MIPS32-NEXT: sllv $9, $9, $7
+; MIPS32-NEXT: or $3, $3, $9
+; MIPS32-NEXT: or $4, $4, $6
+; MIPS32-NEXT: lw $1, 12($1)
+; MIPS32-NEXT: srlv $1, $1, $2
; MIPS32-NEXT: sll $5, $5, 1
-; MIPS32-NEXT: sllv $5, $5, $9
+; MIPS32-NEXT: sllv $5, $5, $7
; MIPS32-NEXT: or $5, $1, $5
-; MIPS32-NEXT: srlv $2, $6, $2
+; MIPS32-NEXT: srlv $2, $8, $2
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: addiu $sp, $sp, 32
;
@@ -504,51 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2: # %bb.0: # %entry
; MIPS32R2-NEXT: addiu $sp, $sp, -32
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT: swl $7, 28($sp)
-; MIPS32R2-NEXT: swl $6, 24($sp)
-; MIPS32R2-NEXT: swl $5, 20($sp)
-; MIPS32R2-NEXT: swl $4, 16($sp)
-; MIPS32R2-NEXT: swl $zero, 12($sp)
-; MIPS32R2-NEXT: swl $zero, 8($sp)
-; MIPS32R2-NEXT: swl $zero, 4($sp)
-; MIPS32R2-NEXT: swl $zero, 0($sp)
-; MIPS32R2-NEXT: swr $7, 31($sp)
-; MIPS32R2-NEXT: swr $6, 27($sp)
-; MIPS32R2-NEXT: swr $5, 23($sp)
-; MIPS32R2-NEXT: swr $4, 19($sp)
-; MIPS32R2-NEXT: swr $zero, 15($sp)
-; MIPS32R2-NEXT: swr $zero, 11($sp)
-; MIPS32R2-NEXT: swr $zero, 7($sp)
-; MIPS32R2-NEXT: swr $zero, 3($sp)
; MIPS32R2-NEXT: addiu $1, $sp, 0
+; MIPS32R2-NEXT: sw $7, 28($sp)
+; MIPS32R2-NEXT: sw $6, 24($sp)
+; MIPS32R2-NEXT: sw $5, 20($sp)
+; MIPS32R2-NEXT: sw $4, 16($sp)
; MIPS32R2-NEXT: addiu $1, $1, 16
; MIPS32R2-NEXT: lw $2, 60($sp)
-; MIPS32R2-NEXT: ext $3, $2, 3, 4
+; MIPS32R2-NEXT: srl $3, $2, 3
+; MIPS32R2-NEXT: andi $3, $3, 12
; MIPS32R2-NEXT: subu $1, $1, $3
-; MIPS32R2-NEXT: lwl $3, 4($1)
-; MIPS32R2-NEXT: lwr $3, 7($1)
-; MIPS32R2-NEXT: sll $4, $3, 1
-; MIPS32R2-NEXT: lwl $5, 8($1)
-; MIPS32R2-NEXT: lwr $5, 11($1)
-; MIPS32R2-NEXT: andi $2, $2, 7
-; MIPS32R2-NEXT: not $6, $2
-; MIPS32R2-NEXT: srlv $7, $5, $2
-; MIPS32R2-NEXT: sllv $4, $4, $6
+; MIPS32R2-NEXT: sw $zero, 12($sp)
+; MIPS32R2-NEXT: sw $zero, 8($sp)
+; MIPS32R2-NEXT: sw $zero, 4($sp)
+; MIPS32R2-NEXT: sw $zero, 0($sp)
+; MIPS32R2-NEXT: lw $3, 4($1)
+; MIPS32R2-NEXT: lw $5, 8($1)
+; MIPS32R2-NEXT: srlv $4, $5, $2
+; MIPS32R2-NEXT: sll $6, $3, 1
+; MIPS32R2-NEXT: andi $7, $2, 31
+; MIPS32R2-NEXT: xori $7, $7, 31
+; MIPS32R2-NEXT: sllv $6, $6, $7
; MIPS32R2-NEXT: srlv $3, $3, $2
-; MIPS32R2-NEXT: lwl $6, 0($1)
-; MIPS32R2-NEXT: lwr $6, 3($1)
-; MIPS32R2-NEXT: sll $8, $6, 1
-; MIPS32R2-NEXT: xori $9, $2, 31
-; MIPS32R2-NEXT: sllv $8, $8, $9
-; MIPS32R2-NEXT: or $3, $3, $8
-; MIPS32R2-NEXT: or $4, $7, $4
-; MIPS32R2-NEXT: lwl $7, 12($1)
-; MIPS32R2-NEXT: lwr $7, 15($1)
-; MIPS32R2-NEXT: srlv $1, $7, $2
+; MIPS32R2-NEXT: lw $8, 0($1)
+; MIPS32R2-NEXT: sll $9, $8, 1
+; MIPS32R2-NEXT: sllv $9, $9, $7
+; MIPS32R2-NEXT: or $3, $3, $9
+; MIPS32R2-NEXT: or $4, $4, $6
+; MIPS32R2-NEXT: lw $1, 12($1)
+; MIPS32R2-NEXT: srlv $1, $1, $2
; MIPS32R2-NEXT: sll $5, $5, 1
-; MIPS32R2-NEXT: sllv $5, $5, $9
+; MIPS32R2-NEXT: sllv $5, $5, $7
; MIPS32R2-NEXT: or $5, $1, $5
-; MIPS32R2-NEXT: srlv $2, $6, $2
+; MIPS32R2-NEXT: srlv $2, $8, $2
; MIPS32R2-NEXT: jr $ra
; MIPS32R2-NEXT: addiu $sp, $sp, 32
;
@@ -563,32 +525,32 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: sw $4, 16($sp)
; MIPS32R6-NEXT: addiu $1, $1, 16
; MIPS32R6-NEXT: lw $2, 60($sp)
-; MIPS32R6-NEXT: ext $3, $2, 3, 4
+; MIPS32R6-NEXT: srl $3, $2, 3
+; MIPS32R6-NEXT: andi $3, $3, 12
; MIPS32R6-NEXT: subu $1, $1, $3
; MIPS32R6-NEXT: sw $zero, 12($sp)
; MIPS32R6-NEXT: sw $zero, 8($sp)
; MIPS32R6-NEXT: sw $zero, 4($sp)
; MIPS32R6-NEXT: sw $zero, 0($sp)
; MIPS32R6-NEXT: lw $3, 4($1)
-; MIPS32R6-NEXT: sll $4, $3, 1
; MIPS32R6-NEXT: lw $5, 8($1)
-; MIPS32R6-NEXT: andi $2, $2, 7
-; MIPS32R6-NEXT: not $6, $2
-; MIPS32R6-NEXT: srlv $7, $5, $2
-; MIPS32R6-NEXT: sllv $4, $4, $6
+; MIPS32R6-NEXT: srlv $4, $5, $2
+; MIPS32R6-NEXT: sll $6, $3, 1
+; MIPS32R6-NEXT: andi $7, $2, 31
+; MIPS32R6-NEXT: xori $7, $7, 31
+; MIPS32R6-NEXT: sllv $6, $6, $7
; MIPS32R6-NEXT: srlv $3, $3, $2
-; MIPS32R6-NEXT: lw $6, 0($1)
-; MIPS32R6-NEXT: sll $8, $6, 1
-; MIPS32R6-NEXT: xori $9, $2, 31
-; MIPS32R6-NEXT: sllv $8, $8, $9
-; MIPS32R6-NEXT: or $3, $3, $8
-; MIPS32R6-NEXT: or $4, $7, $4
+; MIPS32R6-NEXT: lw $8, 0($1)
+; MIPS32R6-NEXT: sll $9, $8, 1
+; MIPS32R6-NEXT: sllv $9, $9, $7
+; MIPS32R6-NEXT: or $3, $3, $9
+; MIPS32R6-NEXT: or $4, $4, $6
; MIPS32R6-NEXT: lw $1, 12($1)
; MIPS32R6-NEXT: srlv $1, $1, $2
; MIPS32R6-NEXT: sll $5, $5, 1
-; MIPS32R6-NEXT: sllv $5, $5, $9
+; MIPS32R6-NEXT: sllv $5, $5, $7
; MIPS32R6-NEXT: or $5, $1, $5
-; MIPS32R6-NEXT: srlv $2, $6, $2
+; MIPS32R6-NEXT: srlv $2, $8, $2
; MIPS32R6-NEXT: jr $ra
; MIPS32R6-NEXT: addiu $sp, $sp, 32
;
@@ -677,53 +639,37 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: swl $7, 28($sp)
-; MMR3-NEXT: swl $6, 24($sp)
-; MMR3-NEXT: swl $5, 20($sp)
; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: swl $4, 16($sp)
-; MMR3-NEXT: swl $2, 12($sp)
-; MMR3-NEXT: swl $2, 8($sp)
-; MMR3-NEXT: swl $2, 4($sp)
-; MMR3-NEXT: swl $2, 0($sp)
-; MMR3-NEXT: swr $7, 31($sp)
-; MMR3-NEXT: swr $6, 27($sp)
-; MMR3-NEXT: swr $5, 23($sp)
-; MMR3-NEXT: swr $4, 19($sp)
-; MMR3-NEXT: swr $2, 15($sp)
-; MMR3-NEXT: swr $2, 11($sp)
-; MMR3-NEXT: swr $2, 7($sp)
-; MMR3-NEXT: swr $2, 3($sp)
+; MMR3-NEXT: swp $6, 24($sp)
+; MMR3-NEXT: swp $4, 16($sp)
+; MMR3-NEXT: sw $2, 12($sp)
+; MMR3-NEXT: sw $2, 8($sp)
+; MMR3-NEXT: sw $2, 4($sp)
+; MMR3-NEXT: sw $2, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
-; MMR3-NEXT: ext $4, $3, 3, 4
-; MMR3-NEXT: subu16 $2, $2, $4
-; MMR3-NEXT: lwl $7, 4($2)
-; MMR3-NEXT: lwr $7, 7($2)
-; MMR3-NEXT: sll16 $4, $7, 1
-; MMR3-NEXT: lwl $5, 8($2)
-; MMR3-NEXT: lwr $5, 11($2)
-; MMR3-NEXT: andi16 $6, $3, 7
-; MMR3-NEXT: not16 $3, $6
-; MMR3-NEXT: andi16 $3, $3, 31
-; MMR3-NEXT: srlv $16, $5, $6
-; MMR3-NEXT: sllv $4, $4, $3
-; MMR3-NEXT: srlv $17, $7, $6
-; MMR3-NEXT: lwl $7, 0($2)
-; MMR3-NEXT: lwr $7, 3($2)
-; MMR3-NEXT: sll16 $3, $7, 1
-; MMR3-NEXT: xori $1, $6, 31
+; MMR3-NEXT: srl16 $4, $3, 3
+; MMR3-NEXT: andi $4, $4, 12
+; MMR3-NEXT: subu16 $5, $2, $4
+; MMR3-NEXT: lwp $6, 4($5)
+; MMR3-NEXT: andi16 $2, $3, 31
+; MMR3-NEXT: srlv $16, $7, $2
+; MMR3-NEXT: sll16 $3, $6, 1
+; MMR3-NEXT: xori $1, $2, 31
+; MMR3-NEXT: sllv $4, $3, $1
+; MMR3-NEXT: srlv $6, $6, $2
+; MMR3-NEXT: lw16 $17, 0($5)
+; MMR3-NEXT: sll16 $3, $17, 1
; MMR3-NEXT: sllv $3, $3, $1
-; MMR3-NEXT: or16 $3, $17
+; MMR3-NEXT: or16 $3, $6
; MMR3-NEXT: or16 $4, $16
-; MMR3-NEXT: lwl $8, 12($2)
-; MMR3-NEXT: lwr $8, 15($2)
-; MMR3-NEXT: srlv $2, $8, $6
-; MMR3-NEXT: sll16 $5, $5, 1
+; MMR3-NEXT: lw16 $5, 12($5)
+; MMR3-NEXT: srlv $6, $5, $2
+; MMR3-NEXT: sll16 $5, $7, 1
; MMR3-NEXT: sllv $5, $5, $1
-; MMR3-NEXT: or16 $5, $2
-; MMR3-NEXT: srlv $2, $7, $6
+; MMR3-NEXT: or16 $5, $6
+; MMR3-NEXT: srlv $2, $17, $2
; MMR3-NEXT: lwp $16, 32($sp)
; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
@@ -735,40 +681,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 16, -4
; MMR6-NEXT: li16 $2, 0
-; MMR6-NEXT: sw $7, 32($sp)
-; MMR6-NEXT: sw $6, 28($sp)
-; MMR6-NEXT: sw $5, 24($sp)
-; MMR6-NEXT: sw $4, 20($sp)
-; MMR6-NEXT: sw $2, 16($sp)
+; MMR6-NEXT: sw $7, 28($sp)
+; MMR6-NEXT: sw $6, 24($sp)
+; MMR6-NEXT: sw $5, 20($sp)
+; MMR6-NEXT: sw $4, 16($sp)
; MMR6-NEXT: sw $2, 12($sp)
; MMR6-NEXT: sw $2, 8($sp)
; MMR6-NEXT: sw $2, 4($sp)
-; MMR6-NEXT: addiu $2, $sp, 4
+; MMR6-NEXT: sw $2, 0($sp)
+; MMR6-NEXT: addiu $2, $sp, 0
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
-; MMR6-NEXT: ext $4, $3, 3, 4
-; MMR6-NEXT: subu16 $5, $2, $4
-; MMR6-NEXT: lw16 $4, 4($5)
-; MMR6-NEXT: sll16 $6, $4, 1
-; MMR6-NEXT: lw16 $7, 8($5)
-; MMR6-NEXT: andi16 $2, $3, 7
-; MMR6-NEXT: not16 $3, $2
-; MMR6-NEXT: andi16 $3, $3, 31
-; MMR6-NEXT: srlv $1, $7, $2
-; MMR6-NEXT: sllv $6, $6, $3
-; MMR6-NEXT: srlv $3, $4, $2
-; MMR6-NEXT: lw16 $16, 0($5)
+; MMR6-NEXT: srl16 $4, $3, 3
+; MMR6-NEXT: andi $4, $4, 12
+; MMR6-NEXT: subu16 $2, $2, $4
+; MMR6-NEXT: lw16 $4, 4($2)
+; MMR6-NEXT: lw16 $5, 8($2)
+; MMR6-NEXT: andi16 $6, $3, 31
+; MMR6-NEXT: srlv $1, $5, $6
+; MMR6-NEXT: sll16 $3, $4, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: sllv $8, $3, $7
+; MMR6-NEXT: srlv $3, $4, $6
+; MMR6-NEXT: lw16 $16, 0($2)
; MMR6-NEXT: sll16 $4, $16, 1
-; MMR6-NEXT: xori $8, $2, 31
-; MMR6-NEXT: sllv $4, $4, $8
+; MMR6-NEXT: sllv $4, $4, $7
; MMR6-NEXT: or $3, $3, $4
-; MMR6-NEXT: or $4, $1, $6
-; MMR6-NEXT: lw16 $5, 12($5)
-; MMR6-NEXT: srlv $1, $5, $2
-; MMR6-NEXT: sll16 $5, $7, 1
-; MMR6-NEXT: sllv $5, $5, $8
-; MMR6-NEXT: or $5, $1, $5
-; MMR6-NEXT: srlv $2, $16, $2
+; MMR6-NEXT: or $4, $1, $8
+; MMR6-NEXT: lw16 $2, 12($2)
+; MMR6-NEXT: srlv $1, $2, $6
+; MMR6-NEXT: sll16 $2, $5, 1
+; MMR6-NEXT: sllv $2, $2, $7
+; MMR6-NEXT: or $5, $1, $2
+; MMR6-NEXT: srlv $2, $16, $6
; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 81f089a5294708..394890a9dcc7c4 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -440,49 +440,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS2: # %bb.0: # %entry
; MIPS2-NEXT: addiu $sp, $sp, -32
; MIPS2-NEXT: .cfi_def_cfa_offset 32
-; MIPS2-NEXT: swl $zero, 28($sp)
-; MIPS2-NEXT: swl $zero, 24($sp)
-; MIPS2-NEXT: swl $zero, 20($sp)
-; MIPS2-NEXT: swl $zero, 16($sp)
-; MIPS2-NEXT: swl $7, 12($sp)
-; MIPS2-NEXT: swl $6, 8($sp)
-; MIPS2-NEXT: swl $5, 4($sp)
-; MIPS2-NEXT: swl $4, 0($sp)
-; MIPS2-NEXT: swr $zero, 31($sp)
-; MIPS2-NEXT: swr $zero, 27($sp)
-; MIPS2-NEXT: swr $zero, 23($sp)
-; MIPS2-NEXT: swr $zero, 19($sp)
-; MIPS2-NEXT: swr $7, 15($sp)
-; MIPS2-NEXT: swr $6, 11($sp)
-; MIPS2-NEXT: swr $5, 7($sp)
-; MIPS2-NEXT: swr $4, 3($sp)
; MIPS2-NEXT: lw $1, 60($sp)
; MIPS2-NEXT: srl $2, $1, 3
-; MIPS2-NEXT: andi $2, $2, 15
+; MIPS2-NEXT: sw $7, 12($sp)
+; MIPS2-NEXT: sw $6, 8($sp)
+; MIPS2-NEXT: sw $5, 4($sp)
+; MIPS2-NEXT: sw $4, 0($sp)
+; MIPS2-NEXT: andi $2, $2, 12
; MIPS2-NEXT: addiu $3, $sp, 0
; MIPS2-NEXT: addu $4, $3, $2
-; MIPS2-NEXT: lwl $5, 8($4)
-; MIPS2-NEXT: lwr $5, 11($4)
-; MIPS2-NEXT: srl $2, $5, 1
-; MIPS2-NEXT: lwl $3, 4($4)
-; MIPS2-NEXT: lwr $3, 7($4)
-; MIPS2-NEXT: andi $1, $1, 7
-; MIPS2-NEXT: not $6, $1
-; MIPS2-NEXT: sllv $7, $3, $1
-; MIPS2-NEXT: srlv $6, $2, $6
-; MIPS2-NEXT: lwl $2, 0($4)
-; MIPS2-NEXT: lwr $2, 3($4)
-; MIPS2-NEXT: sllv $2, $2, $1
-; MIPS2-NEXT: srl $3, $3, 1
-; MIPS2-NEXT: xori $8, $1, 31
-; MIPS2-NEXT: srlv $3, $3, $8
-; MIPS2-NEXT: or $2, $2, $3
-; MIPS2-NEXT: or $3, $7, $6
+; MIPS2-NEXT: sw $zero, 28($sp)
+; MIPS2-NEXT: sw $zero, 24($sp)
+; MIPS2-NEXT: sw $zero, 20($sp)
+; MIPS2-NEXT: sw $zero, 16($sp)
+; MIPS2-NEXT: lw $5, 8($4)
+; MIPS2-NEXT: lw $2, 4($4)
+; MIPS2-NEXT: sllv $3, $2, $1
+; MIPS2-NEXT: srl $6, $5, 1
+; MIPS2-NEXT: andi $7, $1, 31
+; MIPS2-NEXT: xori $7, $7, 31
+; MIPS2-NEXT: srlv $6, $6, $7
+; MIPS2-NEXT: lw $8, 0($4)
+; MIPS2-NEXT: sllv $8, $8, $1
+; MIPS2-NEXT: srl $2, $2, 1
+; MIPS2-NEXT: srlv $2, $2, $7
+; MIPS2-NEXT: or $2, $8, $2
+; MIPS2-NEXT: or $3, $3, $6
; MIPS2-NEXT: sllv $5, $5, $1
-; MIPS2-NEXT: lwl $6, 12($4)
-; MIPS2-NEXT: lwr $6, 15($4)
+; MIPS2-NEXT: lw $6, 12($4)
; MIPS2-NEXT: srl $4, $6, 1
-; MIPS2-NEXT: srlv $4, $4, $8
+; MIPS2-NEXT: srlv $4, $4, $7
; MIPS2-NEXT: or $4, $5, $4
; MIPS2-NEXT: sllv $5, $6, $1
; MIPS2-NEXT: jr $ra
@@ -492,49 +479,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $zero, 28($sp)
-; MIPS32-NEXT: swl $zero, 24($sp)
-; MIPS32-NEXT: swl $zero, 20($sp)
-; MIPS32-NEXT: swl $zero, 16($sp)
-; MIPS32-NEXT: swl $7, 12($sp)
-; MIPS32-NEXT: swl $6, 8($sp)
-; MIPS32-NEXT: swl $5, 4($sp)
-; MIPS32-NEXT: swl $4, 0($sp)
-; MIPS32-NEXT: swr $zero, 31($sp)
-; MIPS32-NEXT: swr $zero, 27($sp)
-; MIPS32-NEXT: swr $zero, 23($sp)
-; MIPS32-NEXT: swr $zero, 19($sp)
-; MIPS32-NEXT: swr $7, 15($sp)
-; MIPS32-NEXT: swr $6, 11($sp)
-; MIPS32-NEXT: swr $5, 7($sp)
-; MIPS32-NEXT: swr $4, 3($sp)
; MIPS32-NEXT: lw $1, 60($sp)
; MIPS32-NEXT: srl $2, $1, 3
-; MIPS32-NEXT: andi $2, $2, 15
+; MIPS32-NEXT: sw $7, 12($sp)
+; MIPS32-NEXT: sw $6, 8($sp)
+; MIPS32-NEXT: sw $5, 4($sp)
+; MIPS32-NEXT: sw $4, 0($sp)
+; MIPS32-NEXT: andi $2, $2, 12
; MIPS32-NEXT: addiu $3, $sp, 0
; MIPS32-NEXT: addu $4, $3, $2
-; MIPS32-NEXT: lwl $5, 8($4)
-; MIPS32-NEXT: lwr $5, 11($4)
-; MIPS32-NEXT: srl $2, $5, 1
-; MIPS32-NEXT: lwl $3, 4($4)
-; MIPS32-NEXT: lwr $3, 7($4)
-; MIPS32-NEXT: andi $1, $1, 7
-; MIPS32-NEXT: not $6, $1
-; MIPS32-NEXT: sllv $7, $3, $1
-; MIPS32-NEXT: srlv $6, $2, $6
-; MIPS32-NEXT: lwl $2, 0($4)
-; MIPS32-NEXT: lwr $2, 3($4)
-; MIPS32-NEXT: sllv $2, $2, $1
-; MIPS32-NEXT: srl $3, $3, 1
-; MIPS32-NEXT: xori $8, $1, 31
-; MIPS32-NEXT: srlv $3, $3, $8
-; MIPS32-NEXT: or $2, $2, $3
-; MIPS32-NEXT: or $3, $7, $6
+; MIPS32-NEXT: sw $zero, 28($sp)
+; MIPS32-NEXT: sw $zero, 24($sp)
+; MIPS32-NEXT: sw $zero, 20($sp)
+; MIPS32-NEXT: sw $zero, 16($sp)
+; MIPS32-NEXT: lw $5, 8($4)
+; MIPS32-NEXT: lw $2, 4($4)
+; MIPS32-NEXT: sllv $3, $2, $1
+; MIPS32-NEXT: srl $6, $5, 1
+; MIPS32-NEXT: andi $7, $1, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: srlv $6, $6, $7
+; MIPS32-NEXT: lw $8, 0($4)
+; MIPS32-NEXT: sllv $8, $8, $1
+; MIPS32-NEXT: srl $2, $2, 1
+; MIPS32-NEXT: srlv $2, $2, $7
+; MIPS32-NEXT: or $2, $8, $2
+; MIPS32-NEXT: or $3, $3, $6
; MIPS32-NEXT: sllv $5, $5, $1
-; MIPS32-NEXT: lwl $6, 12($4)
-; MIPS32-NEXT: lwr $6, 15($4)
+; MIPS32-NEXT: lw $6, 12($4)
; MIPS32-NEXT: srl $4, $6, 1
-; MIPS32-NEXT: srlv $4, $4, $8
+; MIPS32-NEXT: srlv $4, $4, $7
; MIPS32-NEXT: or $4, $5, $4
; MIPS32-NEXT: sllv $5, $6, $1
; MIPS32-NEXT: jr $ra
@@ -544,48 +518,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2: # %bb.0: # %entry
; MIPS32R2-NEXT: addiu $sp, $sp, -32
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT: swl $zero, 28($sp)
-; MIPS32R2-NEXT: swl $zero, 24($sp)
-; MIPS32R2-NEXT: swl $zero, 20($sp)
-; MIPS32R2-NEXT: swl $zero, 16($sp)
-; MIPS32R2-NEXT: swl $7, 12($sp)
-; MIPS32R2-NEXT: swl $6, 8($sp)
-; MIPS32R2-NEXT: swl $5, 4($sp)
-; MIPS32R2-NEXT: swl $4, 0($sp)
-; MIPS32R2-NEXT: swr $zero, 31($sp)
-; MIPS32R2-NEXT: swr $zero, 27($sp)
-; MIPS32R2-NEXT: swr $zero, 23($sp)
-; MIPS32R2-NEXT: swr $zero, 19($sp)
-; MIPS32R2-NEXT: swr $7, 15($sp)
-; MIPS32R2-NEXT: swr $6, 11($sp)
-; MIPS32R2-NEXT: swr $5, 7($sp)
-; MIPS32R2-NEXT: swr $4, 3($sp)
; MIPS32R2-NEXT: lw $1, 60($sp)
-; MIPS32R2-NEXT: ext $2, $1, 3, 4
+; MIPS32R2-NEXT: srl $2, $1, 3
+; MIPS32R2-NEXT: sw $7, 12($sp)
+; MIPS32R2-NEXT: sw $6, 8($sp)
+; MIPS32R2-NEXT: sw $5, 4($sp)
+; MIPS32R2-NEXT: sw $4, 0($sp)
+; MIPS32R2-NEXT: andi $2, $2, 12
; MIPS32R2-NEXT: addiu $3, $sp, 0
; MIPS32R2-NEXT: addu $4, $3, $2
-; MIPS32R2-NEXT: lwl $5, 8($4)
-; MIPS32R2-NEXT: lwr $5, 11($4)
-; MIPS32R2-NEXT: srl $2, $5, 1
-; MIPS32R2-NEXT: lwl $3, 4($4)
-; MIPS32R2-NEXT: lwr $3, 7($4)
-; MIPS32R2-NEXT: andi $1, $1, 7
-; MIPS32R2-NEXT: not $6, $1
-; MIPS32R2-NEXT: sllv $7, $3, $1
-; MIPS32R2-NEXT: srlv $6, $2, $6
-; MIPS32R2-NEXT: lwl $2, 0($4)
-; MIPS32R2-NEXT: lwr $2, 3($4)
-; MIPS32R2-NEXT: sllv $2, $2, $1
-; MIPS32R2-NEXT: srl $3, $3, 1
-; MIPS32R2-NEXT: xori $8, $1, 31
-; MIPS32R2-NEXT: srlv $3, $3, $8
-; MIPS32R2-NEXT: or $2, $2, $3
-; MIPS32R2-NEXT: or $3, $7, $6
+; MIPS32R2-NEXT: sw $zero, 28($sp)
+; MIPS32R2-NEXT: sw $zero, 24($sp)
+; MIPS32R2-NEXT: sw $zero, 20($sp)
+; MIPS32R2-NEXT: sw $zero, 16($sp)
+; MIPS32R2-NEXT: lw $5, 8($4)
+; MIPS32R2-NEXT: lw $2, 4($4)
+; MIPS32R2-NEXT: sllv $3, $2, $1
+; MIPS32R2-NEXT: srl $6, $5, 1
+; MIPS32R2-NEXT: andi $7, $1, 31
+; MIPS32R2-NEXT: xori $7, $7, 31
+; MIPS32R2-NEXT: srlv $6, $6, $7
+; MIPS32R2-NEXT: lw $8, 0($4)
+; MIPS32R2-NEXT: sllv $8, $8, $1
+; MIPS32R2-NEXT: srl $2, $2, 1
+; MIPS32R2-NEXT: srlv $2, $2, $7
+; MIPS32R2-NEXT: or $2, $8, $2
+; MIPS32R2-NEXT: or $3, $3, $6
; MIPS32R2-NEXT: sllv $5, $5, $1
-; MIPS32R2-NEXT: lwl $6, 12($4)
-; MIPS32R2-NEXT: lwr $6, 15($4)
+; MIPS32R2-NEXT: lw $6, 12($4)
; MIPS32R2-NEXT: srl $4, $6, 1
-; MIPS32R2-NEXT: srlv $4, $4, $8
+; MIPS32R2-NEXT: srlv $4, $4, $7
; MIPS32R2-NEXT: or $4, $5, $4
; MIPS32R2-NEXT: sllv $5, $6, $1
; MIPS32R2-NEXT: jr $ra
@@ -596,11 +558,12 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: addiu $sp, $sp, -32
; MIPS32R6-NEXT: .cfi_def_cfa_offset 32
; MIPS32R6-NEXT: lw $1, 60($sp)
+; MIPS32R6-NEXT: srl $2, $1, 3
; MIPS32R6-NEXT: sw $7, 12($sp)
; MIPS32R6-NEXT: sw $6, 8($sp)
; MIPS32R6-NEXT: sw $5, 4($sp)
; MIPS32R6-NEXT: sw $4, 0($sp)
-; MIPS32R6-NEXT: ext $2, $1, 3, 4
+; MIPS32R6-NEXT: andi $2, $2, 12
; MIPS32R6-NEXT: addiu $3, $sp, 0
; MIPS32R6-NEXT: addu $4, $3, $2
; MIPS32R6-NEXT: sw $zero, 28($sp)
@@ -608,23 +571,22 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: sw $zero, 20($sp)
; MIPS32R6-NEXT: sw $zero, 16($sp)
; MIPS32R6-NEXT: lw $5, 8($4)
-; MIPS32R6-NEXT: srl $2, $5, 1
-; MIPS32R6-NEXT: lw $3, 4($4)
-; MIPS32R6-NEXT: andi $1, $1, 7
-; MIPS32R6-NEXT: not $6, $1
-; MIPS32R6-NEXT: sllv $7, $3, $1
-; MIPS32R6-NEXT: srlv $6, $2, $6
-; MIPS32R6-NEXT: lw $2, 0($4)
-; MIPS32R6-NEXT: sllv $2, $2, $1
-; MIPS32R6-NEXT: srl $3, $3, 1
-; MIPS32R6-NEXT: xori $8, $1, 31
-; MIPS32R6-NEXT: srlv $3, $3, $8
-; MIPS32R6-NEXT: or $2, $2, $3
-; MIPS32R6-NEXT: or $3, $7, $6
+; MIPS32R6-NEXT: lw $2, 4($4)
+; MIPS32R6-NEXT: sllv $3, $2, $1
+; MIPS32R6-NEXT: srl $6, $5, 1
+; MIPS32R6-NEXT: andi $7, $1, 31
+; MIPS32R6-NEXT: xori $7, $7, 31
+; MIPS32R6-NEXT: srlv $6, $6, $7
+; MIPS32R6-NEXT: lw $8, 0($4)
+; MIPS32R6-NEXT: sllv $8, $8, $1
+; MIPS32R6-NEXT: srl $2, $2, 1
+; MIPS32R6-NEXT: srlv $2, $2, $7
+; MIPS32R6-NEXT: or $2, $8, $2
+; MIPS32R6-NEXT: or $3, $3, $6
; MIPS32R6-NEXT: sllv $5, $5, $1
; MIPS32R6-NEXT: lw $6, 12($4)
; MIPS32R6-NEXT: srl $4, $6, 1
-; MIPS32R6-NEXT: srlv $4, $4, $8
+; MIPS32R6-NEXT: srlv $4, $4, $7
; MIPS32R6-NEXT: or $4, $5, $4
; MIPS32R6-NEXT: sllv $5, $6, $1
; MIPS32R6-NEXT: jr $ra
@@ -722,47 +684,32 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: swl $2, 28($sp)
-; MMR3-NEXT: swl $2, 24($sp)
-; MMR3-NEXT: swl $2, 20($sp)
-; MMR3-NEXT: swl $2, 16($sp)
-; MMR3-NEXT: swl $7, 12($sp)
-; MMR3-NEXT: swl $6, 8($sp)
-; MMR3-NEXT: swl $5, 4($sp)
-; MMR3-NEXT: swl $4, 0($sp)
-; MMR3-NEXT: swr $2, 31($sp)
-; MMR3-NEXT: swr $2, 27($sp)
-; MMR3-NEXT: swr $2, 23($sp)
-; MMR3-NEXT: swr $2, 19($sp)
-; MMR3-NEXT: swr $7, 15($sp)
-; MMR3-NEXT: swr $6, 11($sp)
-; MMR3-NEXT: swr $5, 7($sp)
-; MMR3-NEXT: swr $4, 3($sp)
+; MMR3-NEXT: sw $2, 28($sp)
+; MMR3-NEXT: sw $2, 24($sp)
+; MMR3-NEXT: sw $2, 20($sp)
+; MMR3-NEXT: sw $2, 16($sp)
+; MMR3-NEXT: swp $6, 8($sp)
+; MMR3-NEXT: swp $4, 0($sp)
; MMR3-NEXT: lw $2, 68($sp)
-; MMR3-NEXT: ext $3, $2, 3, 4
+; MMR3-NEXT: srl16 $3, $2, 3
+; MMR3-NEXT: andi $3, $3, 12
; MMR3-NEXT: addiur1sp $4, 0
; MMR3-NEXT: addu16 $4, $4, $3
-; MMR3-NEXT: lwl $6, 8($4)
-; MMR3-NEXT: lwr $6, 11($4)
-; MMR3-NEXT: srl16 $3, $6, 1
-; MMR3-NEXT: lwl $7, 4($4)
-; MMR3-NEXT: lwr $7, 7($4)
-; MMR3-NEXT: andi16 $5, $2, 7
-; MMR3-NEXT: not16 $2, $5
-; MMR3-NEXT: andi16 $2, $2, 31
+; MMR3-NEXT: lw16 $6, 8($4)
+; MMR3-NEXT: lw16 $7, 4($4)
+; MMR3-NEXT: andi16 $5, $2, 31
; MMR3-NEXT: sllv $16, $7, $5
-; MMR3-NEXT: srlv $3, $3, $2
-; MMR3-NEXT: lwl $1, 0($4)
-; MMR3-NEXT: lwr $1, 3($4)
-; MMR3-NEXT: sllv $17, $1, $5
-; MMR3-NEXT: srl16 $2, $7, 1
+; MMR3-NEXT: srl16 $2, $6, 1
; MMR3-NEXT: xori $1, $5, 31
+; MMR3-NEXT: srlv $3, $2, $1
+; MMR3-NEXT: lw16 $2, 0($4)
+; MMR3-NEXT: sllv $17, $2, $5
+; MMR3-NEXT: srl16 $2, $7, 1
; MMR3-NEXT: srlv $2, $2, $1
; MMR3-NEXT: or16 $2, $17
; MMR3-NEXT: or16 $3, $16
; MMR3-NEXT: sllv $6, $6, $5
-; MMR3-NEXT: lwl $7, 12($4)
-; MMR3-NEXT: lwr $7, 15($4)
+; MMR3-NEXT: lw16 $7, 12($4)
; MMR3-NEXT: srl16 $4, $7, 1
; MMR3-NEXT: srlv $4, $4, $1
; MMR3-NEXT: or16 $4, $6
@@ -785,30 +732,29 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $5, 4($sp)
; MMR6-NEXT: sw $4, 0($sp)
; MMR6-NEXT: lw $2, 60($sp)
-; MMR6-NEXT: ext $3, $2, 3, 4
+; MMR6-NEXT: srl16 $3, $2, 3
+; MMR6-NEXT: andi $3, $3, 12
; MMR6-NEXT: addiu $4, $sp, 0
; MMR6-NEXT: addu16 $4, $4, $3
-; MMR6-NEXT: lw16 $6, 8($4)
-; MMR6-NEXT: srl16 $3, $6, 1
-; MMR6-NEXT: lw16 $7, 4($4)
-; MMR6-NEXT: andi16 $5, $2, 7
-; MMR6-NEXT: not16 $2, $5
-; MMR6-NEXT: andi16 $2, $2, 31
-; MMR6-NEXT: sllv $1, $7, $5
-; MMR6-NEXT: srlv $3, $3, $2
+; MMR6-NEXT: lw16 $5, 8($4)
+; MMR6-NEXT: lw16 $3, 4($4)
+; MMR6-NEXT: andi16 $6, $2, 31
+; MMR6-NEXT: sllv $1, $3, $6
+; MMR6-NEXT: srl16 $2, $5, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: srlv $8, $2, $7
; MMR6-NEXT: lw16 $2, 0($4)
-; MMR6-NEXT: sllv $2, $2, $5
-; MMR6-NEXT: srl16 $7, $7, 1
-; MMR6-NEXT: xori $8, $5, 31
-; MMR6-NEXT: srlv $7, $7, $8
-; MMR6-NEXT: or $2, $2, $7
-; MMR6-NEXT: or $3, $1, $3
-; MMR6-NEXT: sllv $1, $6, $5
-; MMR6-NEXT: lw16 $6, 12($4)
-; MMR6-NEXT: srl16 $4, $6, 1
-; MMR6-NEXT: srlv $4, $4, $8
+; MMR6-NEXT: sllv $2, $2, $6
+; MMR6-NEXT: srl16 $3, $3, 1
+; MMR6-NEXT: srlv $3, $3, $7
+; MMR6-NEXT: or $2, $2, $3
+; MMR6-NEXT: or $3, $1, $8
+; MMR6-NEXT: sllv $1, $5, $6
+; MMR6-NEXT: lw16 $5, 12($4)
+; MMR6-NEXT: srl16 $4, $5, 1
+; MMR6-NEXT: srlv $4, $4, $7
; MMR6-NEXT: or $4, $1, $4
-; MMR6-NEXT: sllv $5, $6, $5
+; MMR6-NEXT: sllv $5, $5, $6
; MMR6-NEXT: addiu $sp, $sp, 32
; MMR6-NEXT: jrc $ra
entry:
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index c48361e0a8035c..72de456cba395b 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -8,58 +8,52 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 8, 2048
; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: li 7, 2048
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 7
-; CHECK-NEXT: mtctr 8
-; CHECK-NEXT: addi 8, 1, 16
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: mtctr 7
+; CHECK-NEXT: addi 7, 1, 16
; CHECK-NEXT: .LBB0_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 9, 0(4)
-; CHECK-NEXT: lwz 10, 4(4)
-; CHECK-NEXT: lwz 11, 8(4)
-; CHECK-NEXT: lwz 12, 12(4)
-; CHECK-NEXT: lwz 0, 12(5)
+; CHECK-NEXT: lwz 8, 0(4)
+; CHECK-NEXT: lwz 9, 4(4)
+; CHECK-NEXT: lwz 10, 8(4)
+; CHECK-NEXT: lwz 11, 12(4)
+; CHECK-NEXT: lwz 12, 12(5)
; CHECK-NEXT: stw 6, 44(1)
; CHECK-NEXT: stw 6, 40(1)
; CHECK-NEXT: stw 6, 36(1)
; CHECK-NEXT: stw 6, 32(1)
-; CHECK-NEXT: stw 12, 28(1)
-; CHECK-NEXT: clrlwi 12, 0, 29
-; CHECK-NEXT: stw 11, 24(1)
-; CHECK-NEXT: nand 11, 0, 7
-; CHECK-NEXT: stw 10, 20(1)
-; CHECK-NEXT: subfic 29, 12, 32
-; CHECK-NEXT: stw 9, 16(1)
-; CHECK-NEXT: rlwinm 9, 0, 29, 28, 31
-; CHECK-NEXT: lwzux 10, 9, 8
-; CHECK-NEXT: clrlwi 11, 11, 27
-; CHECK-NEXT: lwz 0, 8(9)
-; CHECK-NEXT: slw 10, 10, 12
-; CHECK-NEXT: lwz 30, 4(9)
-; CHECK-NEXT: lwz 9, 12(9)
-; CHECK-NEXT: slw 28, 30, 12
-; CHECK-NEXT: srw 30, 30, 29
-; CHECK-NEXT: srw 29, 9, 29
-; CHECK-NEXT: slw 9, 9, 12
-; CHECK-NEXT: slw 12, 0, 12
-; CHECK-NEXT: srwi 0, 0, 1
-; CHECK-NEXT: stw 9, 12(3)
-; CHECK-NEXT: or 9, 12, 29
-; CHECK-NEXT: srw 11, 0, 11
-; CHECK-NEXT: stw 9, 8(3)
-; CHECK-NEXT: or 9, 10, 30
-; CHECK-NEXT: stw 9, 0(3)
-; CHECK-NEXT: or 9, 28, 11
-; CHECK-NEXT: stw 9, 4(3)
+; CHECK-NEXT: stw 11, 28(1)
+; CHECK-NEXT: stw 10, 24(1)
+; CHECK-NEXT: clrlwi 10, 12, 27
+; CHECK-NEXT: stw 9, 20(1)
+; CHECK-NEXT: stw 8, 16(1)
+; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
+; CHECK-NEXT: lwzux 9, 8, 7
+; CHECK-NEXT: subfic 12, 10, 32
+; CHECK-NEXT: lwz 11, 8(8)
+; CHECK-NEXT: slw 9, 9, 10
+; CHECK-NEXT: lwz 0, 4(8)
+; CHECK-NEXT: lwz 8, 12(8)
+; CHECK-NEXT: srw 30, 11, 12
+; CHECK-NEXT: slw 29, 0, 10
+; CHECK-NEXT: srw 0, 0, 12
+; CHECK-NEXT: srw 12, 8, 12
+; CHECK-NEXT: slw 11, 11, 10
+; CHECK-NEXT: slw 8, 8, 10
+; CHECK-NEXT: stw 8, 12(3)
+; CHECK-NEXT: or 8, 11, 12
+; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: or 8, 9, 0
+; CHECK-NEXT: stw 8, 0(3)
+; CHECK-NEXT: or 8, 29, 30
+; CHECK-NEXT: stw 8, 4(3)
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 28, 48(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 1, 1, 64
; CHECK-NEXT: blr
entry:
@@ -83,59 +77,53 @@ for.end: ; preds = %for.body
define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 2048
-; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 7
-; CHECK-NEXT: mtctr 7
-; CHECK-NEXT: addi 7, 1, 36
+; CHECK-NEXT: stwu 1, -48(1)
+; CHECK-NEXT: stw 30, 40(1) # 4-byte Folded Spill
+; CHECK-NEXT: li 6, 2048
+; CHECK-NEXT: mtctr 6
+; CHECK-NEXT: addi 6, 1, 24
; CHECK-NEXT: .LBB1_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 8, 0(4)
-; CHECK-NEXT: lwz 10, 8(4)
-; CHECK-NEXT: lwz 12, 12(5)
-; CHECK-NEXT: lwz 9, 4(4)
-; CHECK-NEXT: lwz 11, 12(4)
-; CHECK-NEXT: stw 10, 44(1)
-; CHECK-NEXT: rlwinm 10, 12, 29, 28, 31
-; CHECK-NEXT: stw 8, 36(1)
-; CHECK-NEXT: srawi 8, 8, 31
-; CHECK-NEXT: stw 11, 48(1)
-; CHECK-NEXT: clrlwi 11, 12, 29
-; CHECK-NEXT: stw 9, 40(1)
-; CHECK-NEXT: nand 9, 12, 6
-; CHECK-NEXT: stw 8, 32(1)
-; CHECK-NEXT: subfic 30, 11, 32
+; CHECK-NEXT: lwz 7, 0(4)
+; CHECK-NEXT: lwz 8, 4(4)
+; CHECK-NEXT: lwz 11, 12(5)
+; CHECK-NEXT: lwz 9, 8(4)
+; CHECK-NEXT: lwz 10, 12(4)
; CHECK-NEXT: stw 8, 28(1)
-; CHECK-NEXT: clrlwi 9, 9, 27
-; CHECK-NEXT: stw 8, 24(1)
-; CHECK-NEXT: stw 8, 20(1)
-; CHECK-NEXT: sub 8, 7, 10
-; CHECK-NEXT: lwz 10, 4(8)
-; CHECK-NEXT: lwz 12, 8(8)
-; CHECK-NEXT: lwz 0, 0(8)
-; CHECK-NEXT: lwz 8, 12(8)
-; CHECK-NEXT: srw 29, 12, 11
-; CHECK-NEXT: slw 12, 12, 30
-; CHECK-NEXT: slw 30, 0, 30
-; CHECK-NEXT: srw 8, 8, 11
-; CHECK-NEXT: sraw 0, 0, 11
-; CHECK-NEXT: srw 11, 10, 11
-; CHECK-NEXT: slwi 10, 10, 1
-; CHECK-NEXT: or 8, 12, 8
-; CHECK-NEXT: slw 9, 10, 9
-; CHECK-NEXT: stw 8, 12(3)
-; CHECK-NEXT: or 8, 30, 11
-; CHECK-NEXT: stw 8, 4(3)
-; CHECK-NEXT: or 8, 29, 9
-; CHECK-NEXT: stw 0, 0(3)
-; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: rlwinm 8, 11, 29, 28, 29
+; CHECK-NEXT: stw 7, 24(1)
+; CHECK-NEXT: srawi 7, 7, 31
+; CHECK-NEXT: stw 10, 36(1)
+; CHECK-NEXT: clrlwi 10, 11, 27
+; CHECK-NEXT: stw 9, 32(1)
+; CHECK-NEXT: subfic 12, 10, 32
+; CHECK-NEXT: stw 7, 20(1)
+; CHECK-NEXT: stw 7, 16(1)
+; CHECK-NEXT: stw 7, 12(1)
+; CHECK-NEXT: stw 7, 8(1)
+; CHECK-NEXT: sub 7, 6, 8
+; CHECK-NEXT: lwz 8, 4(7)
+; CHECK-NEXT: lwz 9, 0(7)
+; CHECK-NEXT: lwz 11, 12(7)
+; CHECK-NEXT: srw 0, 8, 10
+; CHECK-NEXT: lwz 7, 8(7)
+; CHECK-NEXT: slw 30, 9, 12
+; CHECK-NEXT: slw 8, 8, 12
+; CHECK-NEXT: srw 11, 11, 10
+; CHECK-NEXT: slw 12, 7, 12
+; CHECK-NEXT: srw 7, 7, 10
+; CHECK-NEXT: or 7, 8, 7
+; CHECK-NEXT: stw 7, 8(3)
+; CHECK-NEXT: or 7, 12, 11
+; CHECK-NEXT: sraw 9, 9, 10
+; CHECK-NEXT: stw 7, 12(3)
+; CHECK-NEXT: or 7, 30, 0
+; CHECK-NEXT: stw 9, 0(3)
+; CHECK-NEXT: stw 7, 4(3)
; CHECK-NEXT: bdnz .LBB1_1
; CHECK-NEXT: # %bb.2: # %for.end
-; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: addi 1, 1, 64
+; CHECK-NEXT: lwz 30, 40(1) # 4-byte Folded Reload
+; CHECK-NEXT: addi 1, 1, 48
; CHECK-NEXT: blr
entry:
br label %for.body
@@ -159,59 +147,53 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 8, 2048
; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: li 7, 2048
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 7
-; CHECK-NEXT: mtctr 8
-; CHECK-NEXT: addi 8, 1, 32
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: mtctr 7
+; CHECK-NEXT: addi 7, 1, 32
; CHECK-NEXT: .LBB2_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 10, 4(4)
-; CHECK-NEXT: lwz 0, 12(5)
-; CHECK-NEXT: lwz 9, 0(4)
-; CHECK-NEXT: lwz 11, 8(4)
-; CHECK-NEXT: lwz 12, 12(4)
-; CHECK-NEXT: stw 10, 36(1)
-; CHECK-NEXT: rlwinm 10, 0, 29, 28, 31
+; CHECK-NEXT: lwz 8, 0(4)
+; CHECK-NEXT: lwz 12, 12(5)
+; CHECK-NEXT: lwz 9, 4(4)
+; CHECK-NEXT: lwz 10, 8(4)
+; CHECK-NEXT: lwz 11, 12(4)
+; CHECK-NEXT: stw 8, 32(1)
+; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
; CHECK-NEXT: stw 6, 28(1)
-; CHECK-NEXT: sub 10, 8, 10
+; CHECK-NEXT: sub 8, 7, 8
; CHECK-NEXT: stw 6, 24(1)
; CHECK-NEXT: stw 6, 20(1)
; CHECK-NEXT: stw 6, 16(1)
-; CHECK-NEXT: stw 12, 44(1)
-; CHECK-NEXT: clrlwi 12, 0, 29
-; CHECK-NEXT: stw 11, 40(1)
-; CHECK-NEXT: subfic 29, 12, 32
-; CHECK-NEXT: stw 9, 32(1)
-; CHECK-NEXT: nand 9, 0, 7
-; CHECK-NEXT: lwz 11, 4(10)
-; CHECK-NEXT: clrlwi 9, 9, 27
-; CHECK-NEXT: lwz 0, 8(10)
-; CHECK-NEXT: lwz 30, 0(10)
-; CHECK-NEXT: lwz 10, 12(10)
-; CHECK-NEXT: srw 28, 0, 12
-; CHECK-NEXT: slw 0, 0, 29
-; CHECK-NEXT: slw 29, 30, 29
-; CHECK-NEXT: srw 10, 10, 12
-; CHECK-NEXT: srw 30, 30, 12
-; CHECK-NEXT: srw 12, 11, 12
-; CHECK-NEXT: slwi 11, 11, 1
-; CHECK-NEXT: slw 9, 11, 9
-; CHECK-NEXT: or 10, 0, 10
-; CHECK-NEXT: stw 10, 12(3)
-; CHECK-NEXT: or 10, 29, 12
-; CHECK-NEXT: or 9, 28, 9
-; CHECK-NEXT: stw 30, 0(3)
-; CHECK-NEXT: stw 10, 4(3)
-; CHECK-NEXT: stw 9, 8(3)
+; CHECK-NEXT: stw 11, 44(1)
+; CHECK-NEXT: clrlwi 11, 12, 27
+; CHECK-NEXT: stw 10, 40(1)
+; CHECK-NEXT: subfic 0, 11, 32
+; CHECK-NEXT: stw 9, 36(1)
+; CHECK-NEXT: lwz 9, 4(8)
+; CHECK-NEXT: lwz 10, 0(8)
+; CHECK-NEXT: lwz 12, 12(8)
+; CHECK-NEXT: srw 30, 9, 11
+; CHECK-NEXT: lwz 8, 8(8)
+; CHECK-NEXT: slw 29, 10, 0
+; CHECK-NEXT: slw 9, 9, 0
+; CHECK-NEXT: srw 12, 12, 11
+; CHECK-NEXT: slw 0, 8, 0
+; CHECK-NEXT: srw 8, 8, 11
+; CHECK-NEXT: or 8, 9, 8
+; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: or 8, 0, 12
+; CHECK-NEXT: srw 10, 10, 11
+; CHECK-NEXT: stw 8, 12(3)
+; CHECK-NEXT: or 8, 29, 30
+; CHECK-NEXT: stw 10, 0(3)
+; CHECK-NEXT: stw 8, 4(3)
; CHECK-NEXT: bdnz .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 28, 48(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 1, 1, 64
; CHECK-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
index 3e328c6ad9f0ba..d3ca1139b4fd11 100644
--- a/llvm/test/CodeGen/PowerPC/pr59074.ll
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -32,37 +32,36 @@ define void @pr59074(ptr %0) {
; LE32-NEXT: li 7, 0
; LE32-NEXT: li 8, 12
; LE32-NEXT: xxswapd 0, 0
+; LE32-NEXT: rlwimi 5, 6, 0, 30, 28
; LE32-NEXT: addi 4, 4, -12
-; LE32-NEXT: rlwinm 9, 4, 29, 28, 31
-; LE32-NEXT: stxvd2x 0, 6, 5
+; LE32-NEXT: rlwinm 9, 4, 29, 28, 29
+; LE32-NEXT: stxvd2x 0, 0, 5
; LE32-NEXT: stw 7, 44(1)
; LE32-NEXT: stw 7, 40(1)
; LE32-NEXT: stw 7, 36(1)
; LE32-NEXT: stw 8, 16(1)
+; LE32-NEXT: clrlwi 4, 4, 27
; LE32-NEXT: lwzux 5, 9, 6
-; LE32-NEXT: li 6, 7
-; LE32-NEXT: lwz 7, 8(9)
-; LE32-NEXT: nand 6, 4, 6
-; LE32-NEXT: lwz 8, 4(9)
-; LE32-NEXT: clrlwi 4, 4, 29
-; LE32-NEXT: lwz 9, 12(9)
-; LE32-NEXT: clrlwi 6, 6, 27
+; LE32-NEXT: lwz 6, 8(9)
+; LE32-NEXT: lwz 7, 4(9)
+; LE32-NEXT: lwz 8, 12(9)
+; LE32-NEXT: xori 9, 4, 31
; LE32-NEXT: subfic 11, 4, 32
; LE32-NEXT: srw 5, 5, 4
-; LE32-NEXT: slwi 10, 7, 1
-; LE32-NEXT: srw 7, 7, 4
-; LE32-NEXT: slw 6, 10, 6
-; LE32-NEXT: srw 10, 8, 4
-; LE32-NEXT: slw 8, 8, 11
-; LE32-NEXT: slw 11, 9, 11
-; LE32-NEXT: srw 4, 9, 4
-; LE32-NEXT: or 5, 8, 5
-; LE32-NEXT: or 7, 11, 7
-; LE32-NEXT: or 6, 10, 6
+; LE32-NEXT: slwi 10, 6, 1
+; LE32-NEXT: srw 6, 6, 4
+; LE32-NEXT: slw 9, 10, 9
+; LE32-NEXT: srw 10, 7, 4
+; LE32-NEXT: slw 7, 7, 11
+; LE32-NEXT: slw 11, 8, 11
+; LE32-NEXT: srw 4, 8, 4
+; LE32-NEXT: or 5, 7, 5
+; LE32-NEXT: or 6, 11, 6
+; LE32-NEXT: or 7, 10, 9
; LE32-NEXT: stw 4, 12(3)
-; LE32-NEXT: stw 7, 8(3)
+; LE32-NEXT: stw 6, 8(3)
; LE32-NEXT: stw 5, 0(3)
-; LE32-NEXT: stw 6, 4(3)
+; LE32-NEXT: stw 7, 4(3)
; LE32-NEXT: addi 1, 1, 80
; LE32-NEXT: blr
;
@@ -89,37 +88,33 @@ define void @pr59074(ptr %0) {
; BE32-NEXT: li 6, 12
; BE32-NEXT: li 7, 0
; BE32-NEXT: addi 8, 1, -48
-; BE32-NEXT: li 10, 7
; BE32-NEXT: stxvw4x 0, 0, 5
-; BE32-NEXT: addi 4, 4, -12
; BE32-NEXT: stw 6, -36(1)
+; BE32-NEXT: addi 4, 4, -12
; BE32-NEXT: stw 7, -40(1)
; BE32-NEXT: stw 7, -44(1)
-; BE32-NEXT: rlwinm 9, 4, 29, 28, 31
; BE32-NEXT: stw 7, -48(1)
+; BE32-NEXT: rlwinm 9, 4, 29, 28, 29
+; BE32-NEXT: clrlwi 4, 4, 27
; BE32-NEXT: sub 5, 8, 9
-; BE32-NEXT: nand 6, 4, 10
-; BE32-NEXT: clrlwi 4, 4, 29
-; BE32-NEXT: clrlwi 6, 6, 27
-; BE32-NEXT: lwz 7, 4(5)
-; BE32-NEXT: lwz 8, 8(5)
-; BE32-NEXT: lwz 9, 0(5)
-; BE32-NEXT: lwz 5, 12(5)
-; BE32-NEXT: slwi 10, 7, 1
-; BE32-NEXT: srw 11, 8, 4
-; BE32-NEXT: srw 7, 7, 4
-; BE32-NEXT: srw 5, 5, 4
-; BE32-NEXT: slw 6, 10, 6
+; BE32-NEXT: lwz 6, 4(5)
+; BE32-NEXT: lwz 7, 0(5)
+; BE32-NEXT: lwz 8, 12(5)
+; BE32-NEXT: lwz 5, 8(5)
; BE32-NEXT: subfic 10, 4, 32
-; BE32-NEXT: srw 4, 9, 4
-; BE32-NEXT: slw 8, 8, 10
-; BE32-NEXT: slw 10, 9, 10
-; BE32-NEXT: or 6, 11, 6
-; BE32-NEXT: or 7, 10, 7
-; BE32-NEXT: or 5, 8, 5
+; BE32-NEXT: srw 9, 6, 4
+; BE32-NEXT: slw 11, 7, 10
+; BE32-NEXT: srw 8, 8, 4
+; BE32-NEXT: slw 6, 6, 10
+; BE32-NEXT: slw 10, 5, 10
+; BE32-NEXT: srw 5, 5, 4
+; BE32-NEXT: srw 4, 7, 4
+; BE32-NEXT: or 7, 11, 9
+; BE32-NEXT: or 8, 10, 8
+; BE32-NEXT: or 5, 6, 5
; BE32-NEXT: stw 4, 0(3)
-; BE32-NEXT: stw 6, 8(3)
-; BE32-NEXT: stw 5, 12(3)
+; BE32-NEXT: stw 5, 8(3)
+; BE32-NEXT: stw 8, 12(3)
; BE32-NEXT: stw 7, 4(3)
; BE32-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
index f6fdb4ae207947..4f1b7bdc8b552a 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -233,9 +233,96 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
+; LE-32BIT-NEXT: stw 6, 28(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
+; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 32
+; LE-32BIT-NEXT: stw 9, 40(1)
+; LE-32BIT-NEXT: sub 3, 3, 6
+; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 48
+; LE-32BIT-NEXT: blr
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_16bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: ld 6, 8(3)
+; LE-64BIT-NEXT: ld 3, 0(3)
+; LE-64BIT-NEXT: slwi 4, 4, 5
+; LE-64BIT-NEXT: subfic 7, 4, 64
+; LE-64BIT-NEXT: srd 3, 3, 4
+; LE-64BIT-NEXT: sld 7, 6, 7
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: addi 7, 4, -64
+; LE-64BIT-NEXT: srd 4, 6, 4
+; LE-64BIT-NEXT: srd 7, 6, 7
+; LE-64BIT-NEXT: std 4, 8(5)
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: lshr_16bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: lwz 4, 12(4)
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 3, 8(3)
+; BE-NEXT: slwi 4, 4, 5
+; BE-NEXT: subfic 7, 4, 64
+; BE-NEXT: srd 3, 3, 4
+; BE-NEXT: sld 7, 6, 7
+; BE-NEXT: addi 8, 4, -64
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: srd 7, 6, 8
+; BE-NEXT: srd 4, 6, 4
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: std 4, 0(5)
+; BE-NEXT: std 3, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: lshr_16bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -48(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: addi 3, 1, 32
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: sub 3, 3, 4
; LE-32BIT-NEXT: stw 6, 24(1)
@@ -255,12 +342,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
%res = lshr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: shl_16bytes:
; LE-64BIT: # %bb.0:
@@ -309,7 +397,93 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 6, 44(1)
; LE-32BIT-NEXT: stw 6, 40(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
+; LE-32BIT-NEXT: stw 6, 36(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: subfic 8, 4, 32
+; LE-32BIT-NEXT: stw 7, 16(1)
+; LE-32BIT-NEXT: lwzux 3, 6, 3
+; LE-32BIT-NEXT: lwz 9, 4(6)
+; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 7, 8(6)
+; LE-32BIT-NEXT: lwz 6, 12(6)
+; LE-32BIT-NEXT: slw 11, 9, 4
+; LE-32BIT-NEXT: srw 9, 9, 8
+; LE-32BIT-NEXT: srw 10, 7, 8
+; LE-32BIT-NEXT: srw 8, 6, 8
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: or 3, 3, 9
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: or 4, 7, 8
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 48
+; LE-32BIT-NEXT: blr
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_16bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: ld 6, 0(3)
+; LE-64BIT-NEXT: ld 3, 8(3)
+; LE-64BIT-NEXT: slwi 4, 4, 5
+; LE-64BIT-NEXT: subfic 7, 4, 64
+; LE-64BIT-NEXT: sld 3, 3, 4
+; LE-64BIT-NEXT: srd 7, 6, 7
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: addi 7, 4, -64
+; LE-64BIT-NEXT: sld 4, 6, 4
+; LE-64BIT-NEXT: sld 7, 6, 7
+; LE-64BIT-NEXT: std 4, 0(5)
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: std 3, 8(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: shl_16bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: lwz 4, 12(4)
+; BE-NEXT: ld 6, 8(3)
+; BE-NEXT: ld 3, 0(3)
+; BE-NEXT: slwi 4, 4, 5
+; BE-NEXT: subfic 7, 4, 64
+; BE-NEXT: sld 3, 3, 4
+; BE-NEXT: srd 7, 6, 7
+; BE-NEXT: addi 8, 4, -64
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: sld 7, 6, 8
+; BE-NEXT: sld 4, 6, 4
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: std 4, 8(5)
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: shl_16bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -48(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29
; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
; LE-32BIT-NEXT: stw 3, 28(1)
@@ -328,12 +502,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
%res = shl i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_16bytes:
; LE-64BIT: # %bb.0:
@@ -361,17 +536,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: slwi 4, 4, 3
; BE-NEXT: addi 7, 4, -64
; BE-NEXT: cmpwi 7, 1
-; BE-NEXT: blt 0, .LBB8_2
+; BE-NEXT: blt 0, .LBB10_2
; BE-NEXT: # %bb.1:
; BE-NEXT: srad 3, 6, 7
-; BE-NEXT: b .LBB8_3
-; BE-NEXT: .LBB8_2:
+; BE-NEXT: b .LBB10_3
+; BE-NEXT: .LBB10_2:
; BE-NEXT: ld 3, 8(3)
; BE-NEXT: subfic 7, 4, 64
; BE-NEXT: sld 7, 6, 7
; BE-NEXT: srd 3, 3, 4
; BE-NEXT: or 3, 3, 7
-; BE-NEXT: .LBB8_3:
+; BE-NEXT: .LBB10_3:
; BE-NEXT: srad 4, 6, 4
; BE-NEXT: std 3, 8(5)
; BE-NEXT: std 4, 0(5)
@@ -388,7 +563,100 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
+; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 9, 40(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
+; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: sraw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 48
+; LE-32BIT-NEXT: blr
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_16bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: ld 6, 8(3)
+; LE-64BIT-NEXT: ld 3, 0(3)
+; LE-64BIT-NEXT: slwi 4, 4, 5
+; LE-64BIT-NEXT: subfic 7, 4, 64
+; LE-64BIT-NEXT: srd 3, 3, 4
+; LE-64BIT-NEXT: sld 7, 6, 7
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: addi 7, 4, -64
+; LE-64BIT-NEXT: srad 4, 6, 4
+; LE-64BIT-NEXT: cmpwi 7, 1
+; LE-64BIT-NEXT: srad 8, 6, 7
+; LE-64BIT-NEXT: std 4, 8(5)
+; LE-64BIT-NEXT: isellt 3, 3, 8
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: ashr_16bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: lwz 4, 12(4)
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: slwi 4, 4, 5
+; BE-NEXT: addi 7, 4, -64
+; BE-NEXT: cmpwi 7, 1
+; BE-NEXT: blt 0, .LBB11_2
+; BE-NEXT: # %bb.1:
+; BE-NEXT: srad 3, 6, 7
+; BE-NEXT: b .LBB11_3
+; BE-NEXT: .LBB11_2:
+; BE-NEXT: ld 3, 8(3)
+; BE-NEXT: subfic 7, 4, 64
+; BE-NEXT: sld 7, 6, 7
+; BE-NEXT: srd 3, 3, 4
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: .LBB11_3:
+; BE-NEXT: srad 4, 6, 4
+; BE-NEXT: std 3, 8(5)
+; BE-NEXT: std 4, 0(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: ashr_16bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -48(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: addi 6, 1, 32
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: srawi 3, 7, 31
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29
; LE-32BIT-NEXT: stw 9, 40(1)
; LE-32BIT-NEXT: stw 8, 36(1)
; LE-32BIT-NEXT: stw 7, 32(1)
@@ -408,8 +676,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
%res = ashr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
@@ -422,20 +690,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: lxvd2x 1, 0, 3
; LE-64BIT-NEXT: xxlxor 2, 2, 2
; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: li 8, 32
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
; LE-64BIT-NEXT: lwz 3, 0(4)
; LE-64BIT-NEXT: li 4, 48
; LE-64BIT-NEXT: stxvd2x 2, 7, 4
-; LE-64BIT-NEXT: li 4, 32
-; LE-64BIT-NEXT: clrldi 3, 3, 59
-; LE-64BIT-NEXT: stxvd2x 2, 7, 4
+; LE-64BIT-NEXT: stxvd2x 2, 7, 8
+; LE-64BIT-NEXT: rlwinm 4, 3, 0, 27, 28
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 26, 28
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: stxvd2x 1, 0, 7
-; LE-64BIT-NEXT: lxvd2x 0, 7, 3
-; LE-64BIT-NEXT: add 3, 7, 3
-; LE-64BIT-NEXT: lxvd2x 1, 3, 6
-; LE-64BIT-NEXT: stxvd2x 1, 5, 6
-; LE-64BIT-NEXT: stxvd2x 0, 0, 5
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: subfic 7, 3, 64
+; LE-64BIT-NEXT: ld 8, 8(4)
+; LE-64BIT-NEXT: ld 9, 16(4)
+; LE-64BIT-NEXT: ld 4, 24(4)
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sld 10, 8, 7
+; LE-64BIT-NEXT: sld 11, 4, 7
+; LE-64BIT-NEXT: srd 8, 8, 3
+; LE-64BIT-NEXT: sld 7, 9, 7
+; LE-64BIT-NEXT: or 6, 10, 6
+; LE-64BIT-NEXT: srd 10, 9, 3
+; LE-64BIT-NEXT: srd 3, 4, 3
+; LE-64BIT-NEXT: or 7, 7, 8
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: std 6, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: lshr_32bytes:
@@ -445,33 +728,44 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 24(9)
-; BE-NEXT: std 10, 16(9)
-; BE-NEXT: std 10, 8(9)
-; BE-NEXT: std 10, -64(1)
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: clrlwi 3, 4, 27
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 0, 27, 28
; BE-NEXT: neg 3, 3
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
-; BE-NEXT: std 6, 32(9)
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: addi 4, 1, -32
-; BE-NEXT: ldux 3, 4, 3
-; BE-NEXT: ld 6, 8(4)
-; BE-NEXT: ld 7, 24(4)
-; BE-NEXT: ld 4, 16(4)
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srd 3, 3, 4
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 4, 16(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: std 6, 16(5)
; BE-NEXT: std 7, 24(5)
-; BE-NEXT: std 6, 8(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: lshr_32bytes:
; LE-32BIT: # %bb.0:
-; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: stwu 1, -112(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
; LE-32BIT-NEXT: lwz 8, 4(3)
@@ -482,11 +776,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 0, 24(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 3, 76(1)
-; LE-32BIT-NEXT: addi 3, 1, 48
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 6, 44(1)
-; LE-32BIT-NEXT: sub 3, 3, 4
; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
@@ -494,30 +784,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: sub 3, 3, 6
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
; LE-32BIT-NEXT: stw 12, 68(1)
; LE-32BIT-NEXT: stw 11, 64(1)
; LE-32BIT-NEXT: stw 10, 60(1)
; LE-32BIT-NEXT: stw 9, 56(1)
; LE-32BIT-NEXT: stw 8, 52(1)
; LE-32BIT-NEXT: stw 7, 48(1)
-; LE-32BIT-NEXT: lwz 4, 4(3)
-; LE-32BIT-NEXT: lwz 6, 0(3)
-; LE-32BIT-NEXT: lwz 7, 12(3)
-; LE-32BIT-NEXT: lwz 8, 8(3)
-; LE-32BIT-NEXT: lwz 9, 20(3)
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: lwz 11, 24(3)
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: srw 28, 8, 4
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 0, 12
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: stw 10, 16(5)
-; LE-32BIT-NEXT: stw 9, 20(5)
-; LE-32BIT-NEXT: stw 8, 8(5)
-; LE-32BIT-NEXT: stw 7, 12(5)
-; LE-32BIT-NEXT: stw 6, 0(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
-; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: stw 3, 20(5)
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -526,32 +856,297 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i256 %res, ptr %dst, align 1
ret void
}
-define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; LE-64BIT-LABEL: shl_32bytes:
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_32bytes_wordOff:
; LE-64BIT: # %bb.0:
; LE-64BIT-NEXT: li 6, 16
; LE-64BIT-NEXT: lxvd2x 1, 0, 3
; LE-64BIT-NEXT: xxlxor 2, 2, 2
-; LE-64BIT-NEXT: li 7, 48
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: li 8, 32
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
; LE-64BIT-NEXT: lwz 3, 0(4)
-; LE-64BIT-NEXT: addi 4, 1, -64
-; LE-64BIT-NEXT: stxvd2x 2, 4, 6
-; LE-64BIT-NEXT: clrlwi 3, 3, 27
-; LE-64BIT-NEXT: stxvd2x 0, 4, 7
-; LE-64BIT-NEXT: li 7, 32
-; LE-64BIT-NEXT: neg 3, 3
-; LE-64BIT-NEXT: stxvd2x 1, 4, 7
-; LE-64BIT-NEXT: stxvd2x 2, 0, 4
-; LE-64BIT-NEXT: extsw 3, 3
-; LE-64BIT-NEXT: addi 4, 1, -32
-; LE-64BIT-NEXT: lxvd2x 0, 4, 3
-; LE-64BIT-NEXT: add 3, 4, 3
+; LE-64BIT-NEXT: li 4, 48
+; LE-64BIT-NEXT: stxvd2x 2, 7, 4
+; LE-64BIT-NEXT: stxvd2x 2, 7, 8
+; LE-64BIT-NEXT: rlwinm 4, 3, 2, 27, 28
+; LE-64BIT-NEXT: rlwinm 3, 3, 5, 26, 26
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: stxvd2x 1, 0, 7
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: subfic 7, 3, 64
+; LE-64BIT-NEXT: ld 8, 8(4)
+; LE-64BIT-NEXT: ld 9, 16(4)
+; LE-64BIT-NEXT: ld 4, 24(4)
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sld 10, 8, 7
+; LE-64BIT-NEXT: sld 11, 4, 7
+; LE-64BIT-NEXT: srd 8, 8, 3
+; LE-64BIT-NEXT: sld 7, 9, 7
+; LE-64BIT-NEXT: or 6, 10, 6
+; LE-64BIT-NEXT: srd 10, 9, 3
+; LE-64BIT-NEXT: srd 3, 4, 3
+; LE-64BIT-NEXT: or 7, 7, 8
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: std 6, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: lshr_32bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 7, 8(3)
+; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 2, 27, 28
+; BE-NEXT: neg 3, 3
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: rlwinm 4, 4, 5, 26, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srd 3, 3, 4
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: lshr_32bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: sub 3, 3, 4
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: stw 6, 36(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: stw 6, 28(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
+; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: lwz 4, 4(3)
+; LE-32BIT-NEXT: lwz 6, 0(3)
+; LE-32BIT-NEXT: lwz 7, 12(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 20(3)
+; LE-32BIT-NEXT: lwz 10, 16(3)
+; LE-32BIT-NEXT: lwz 11, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: stw 10, 16(5)
+; LE-32BIT-NEXT: stw 9, 20(5)
+; LE-32BIT-NEXT: stw 8, 8(5)
+; LE-32BIT-NEXT: stw 7, 12(5)
+; LE-32BIT-NEXT: stw 6, 0(5)
+; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_32bytes_dwordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: lwz 3, 0(4)
+; LE-64BIT-NEXT: li 4, 48
+; LE-64BIT-NEXT: stxvd2x 2, 7, 4
+; LE-64BIT-NEXT: li 4, 32
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28
+; LE-64BIT-NEXT: stxvd2x 2, 7, 4
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: stxvd2x 1, 0, 7
+; LE-64BIT-NEXT: lxvd2x 0, 7, 3
+; LE-64BIT-NEXT: add 3, 7, 3
; LE-64BIT-NEXT: lxvd2x 1, 3, 6
; LE-64BIT-NEXT: stxvd2x 1, 5, 6
; LE-64BIT-NEXT: stxvd2x 0, 0, 5
; LE-64BIT-NEXT: blr
;
+; BE-LABEL: lshr_32bytes_dwordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 6, 0
+; BE-NEXT: std 6, -40(1)
+; BE-NEXT: std 6, -48(1)
+; BE-NEXT: std 6, -56(1)
+; BE-NEXT: std 6, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 3, 27, 28
+; BE-NEXT: neg 3, 3
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: addi 4, 1, -32
+; BE-NEXT: ldux 3, 4, 3
+; BE-NEXT: ld 6, 8(4)
+; BE-NEXT: ld 7, 24(4)
+; BE-NEXT: ld 4, 16(4)
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: std 4, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 6, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: lshr_32bytes_dwordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: sub 3, 3, 4
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: stw 6, 36(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: stw 6, 28(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
+; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: lwz 4, 4(3)
+; LE-32BIT-NEXT: lwz 6, 0(3)
+; LE-32BIT-NEXT: lwz 7, 12(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 20(3)
+; LE-32BIT-NEXT: lwz 10, 16(3)
+; LE-32BIT-NEXT: lwz 11, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: stw 10, 16(5)
+; LE-32BIT-NEXT: stw 9, 20(5)
+; LE-32BIT-NEXT: stw 8, 8(5)
+; LE-32BIT-NEXT: stw 7, 12(5)
+; LE-32BIT-NEXT: stw 6, 0(5)
+; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: addi 8, 1, -32
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: stxvd2x 2, 7, 6
+; LE-64BIT-NEXT: li 6, 48
+; LE-64BIT-NEXT: rlwinm 3, 4, 0, 27, 28
+; LE-64BIT-NEXT: rlwinm 4, 4, 3, 26, 28
+; LE-64BIT-NEXT: neg 3, 3
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: li 6, 32
+; LE-64BIT-NEXT: extsw 3, 3
+; LE-64BIT-NEXT: stxvd2x 1, 7, 6
+; LE-64BIT-NEXT: stxvd2x 2, 0, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
+; LE-64BIT-NEXT: ldux 3, 8, 3
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
+; LE-64BIT-NEXT: sld 7, 7, 4
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: sld 3, 3, 4
+; LE-64BIT-NEXT: or 6, 8, 6
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
; BE-LABEL: shl_32bytes:
; BE: # %bb.0:
; BE-NEXT: ld 6, 0(3)
@@ -559,29 +1154,215 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 56(9)
-; BE-NEXT: std 10, 48(9)
-; BE-NEXT: std 10, 40(9)
-; BE-NEXT: std 10, 32(9)
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 8, 16(9)
-; BE-NEXT: std 7, 8(9)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
; BE-NEXT: std 6, -64(1)
-; BE-NEXT: clrldi 3, 4, 59
-; BE-NEXT: ldux 4, 3, 9
-; BE-NEXT: ld 6, 8(3)
-; BE-NEXT: ld 7, 24(3)
-; BE-NEXT: ld 3, 16(3)
-; BE-NEXT: std 4, 0(5)
-; BE-NEXT: std 3, 16(5)
-; BE-NEXT: std 7, 24(5)
-; BE-NEXT: std 6, 8(5)
+; BE-NEXT: rlwinm 3, 4, 0, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: sld 6, 6, 4
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
+; BE-NEXT: sld 3, 3, 4
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
+; BE-NEXT: std 3, 24(5)
+; BE-NEXT: std 7, 16(5)
+; BE-NEXT: std 6, 0(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: shl_32bytes:
; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -112(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 6, 76(1)
+; LE-32BIT-NEXT: stw 6, 72(1)
+; LE-32BIT-NEXT: stw 6, 68(1)
+; LE-32BIT-NEXT: stw 6, 64(1)
+; LE-32BIT-NEXT: stw 6, 60(1)
+; LE-32BIT-NEXT: stw 6, 56(1)
+; LE-32BIT-NEXT: stw 6, 52(1)
+; LE-32BIT-NEXT: stw 6, 48(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 0, 40(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 12, 36(1)
+; LE-32BIT-NEXT: subfic 12, 4, 32
+; LE-32BIT-NEXT: stw 11, 32(1)
+; LE-32BIT-NEXT: stw 10, 28(1)
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: stw 7, 16(1)
+; LE-32BIT-NEXT: lwzux 3, 6, 3
+; LE-32BIT-NEXT: lwz 7, 8(6)
+; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 8, 4(6)
+; LE-32BIT-NEXT: lwz 9, 16(6)
+; LE-32BIT-NEXT: srw 30, 7, 12
+; LE-32BIT-NEXT: lwz 10, 12(6)
+; LE-32BIT-NEXT: slw 29, 8, 4
+; LE-32BIT-NEXT: lwz 11, 24(6)
+; LE-32BIT-NEXT: srw 8, 8, 12
+; LE-32BIT-NEXT: lwz 0, 20(6)
+; LE-32BIT-NEXT: srw 28, 9, 12
+; LE-32BIT-NEXT: lwz 6, 28(6)
+; LE-32BIT-NEXT: slw 27, 10, 4
+; LE-32BIT-NEXT: srw 10, 10, 12
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: srw 26, 11, 12
+; LE-32BIT-NEXT: slw 25, 0, 4
+; LE-32BIT-NEXT: srw 0, 0, 12
+; LE-32BIT-NEXT: slw 9, 9, 4
+; LE-32BIT-NEXT: srw 12, 6, 12
+; LE-32BIT-NEXT: slw 11, 11, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: stw 4, 28(5)
+; LE-32BIT-NEXT: or 4, 11, 12
+; LE-32BIT-NEXT: stw 4, 24(5)
+; LE-32BIT-NEXT: or 4, 9, 0
+; LE-32BIT-NEXT: stw 4, 16(5)
+; LE-32BIT-NEXT: or 4, 25, 26
+; LE-32BIT-NEXT: stw 4, 20(5)
+; LE-32BIT-NEXT: or 4, 7, 10
+; LE-32BIT-NEXT: or 3, 3, 8
+; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: or 4, 27, 28
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: addi 8, 1, -32
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: stxvd2x 2, 7, 6
+; LE-64BIT-NEXT: li 6, 48
+; LE-64BIT-NEXT: rlwinm 3, 4, 2, 27, 28
+; LE-64BIT-NEXT: rlwinm 4, 4, 5, 26, 26
+; LE-64BIT-NEXT: neg 3, 3
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: li 6, 32
+; LE-64BIT-NEXT: extsw 3, 3
+; LE-64BIT-NEXT: stxvd2x 1, 7, 6
+; LE-64BIT-NEXT: stxvd2x 2, 0, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
+; LE-64BIT-NEXT: ldux 3, 8, 3
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
+; LE-64BIT-NEXT: sld 7, 7, 4
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: sld 3, 3, 4
+; LE-64BIT-NEXT: or 6, 8, 6
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: shl_32bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 7, 8(3)
+; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
+; BE-NEXT: std 6, -64(1)
+; BE-NEXT: rlwinm 3, 4, 2, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: rlwinm 4, 4, 5, 26, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: sld 6, 6, 4
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
+; BE-NEXT: sld 3, 3, 4
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
+; BE-NEXT: std 3, 24(5)
+; BE-NEXT: std 7, 16(5)
+; BE-NEXT: std 6, 0(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: shl_32bytes_wordOff:
+; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -80(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
@@ -595,7 +1376,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 28(4)
; LE-32BIT-NEXT: stw 6, 76(1)
; LE-32BIT-NEXT: stw 6, 72(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29
; LE-32BIT-NEXT: stw 6, 68(1)
; LE-32BIT-NEXT: stw 6, 64(1)
; LE-32BIT-NEXT: stw 6, 60(1)
@@ -630,69 +1411,496 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 80
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes_dwordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: li 7, 48
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: lwz 3, 0(4)
+; LE-64BIT-NEXT: addi 4, 1, -64
+; LE-64BIT-NEXT: stxvd2x 2, 4, 6
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28
+; LE-64BIT-NEXT: stxvd2x 0, 4, 7
+; LE-64BIT-NEXT: li 7, 32
+; LE-64BIT-NEXT: neg 3, 3
+; LE-64BIT-NEXT: stxvd2x 1, 4, 7
+; LE-64BIT-NEXT: stxvd2x 2, 0, 4
+; LE-64BIT-NEXT: extsw 3, 3
+; LE-64BIT-NEXT: addi 4, 1, -32
+; LE-64BIT-NEXT: lxvd2x 0, 4, 3
+; LE-64BIT-NEXT: add 3, 4, 3
+; LE-64BIT-NEXT: lxvd2x 1, 3, 6
+; LE-64BIT-NEXT: stxvd2x 1, 5, 6
+; LE-64BIT-NEXT: stxvd2x 0, 0, 5
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: shl_32bytes_dwordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 6, 0
+; BE-NEXT: std 6, -8(1)
+; BE-NEXT: std 6, -16(1)
+; BE-NEXT: std 6, -24(1)
+; BE-NEXT: std 6, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 8, -56(1)
+; BE-NEXT: std 7, -64(1)
+; BE-NEXT: rlwinm 3, 4, 3, 27, 28
+; BE-NEXT: addi 4, 1, -64
+; BE-NEXT: ldux 4, 3, 4
+; BE-NEXT: ld 6, 8(3)
+; BE-NEXT: ld 7, 24(3)
+; BE-NEXT: ld 3, 16(3)
+; BE-NEXT: std 4, 0(5)
+; BE-NEXT: std 3, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 6, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: shl_32bytes_dwordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 6, 76(1)
+; LE-32BIT-NEXT: stw 6, 72(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 6, 68(1)
+; LE-32BIT-NEXT: stw 6, 64(1)
+; LE-32BIT-NEXT: stw 6, 60(1)
+; LE-32BIT-NEXT: stw 6, 56(1)
+; LE-32BIT-NEXT: stw 6, 52(1)
+; LE-32BIT-NEXT: stw 6, 48(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 0, 40(1)
+; LE-32BIT-NEXT: stw 12, 36(1)
+; LE-32BIT-NEXT: stw 11, 32(1)
+; LE-32BIT-NEXT: stw 10, 28(1)
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: stw 7, 16(1)
+; LE-32BIT-NEXT: lwzux 3, 4, 3
+; LE-32BIT-NEXT: lwz 6, 12(4)
+; LE-32BIT-NEXT: lwz 7, 8(4)
+; LE-32BIT-NEXT: lwz 8, 20(4)
+; LE-32BIT-NEXT: lwz 9, 16(4)
+; LE-32BIT-NEXT: lwz 10, 28(4)
+; LE-32BIT-NEXT: lwz 11, 24(4)
+; LE-32BIT-NEXT: ori 4, 4, 4
+; LE-32BIT-NEXT: lwz 4, 0(4)
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: stw 10, 28(5)
+; LE-32BIT-NEXT: stw 9, 16(5)
+; LE-32BIT-NEXT: stw 8, 20(5)
+; LE-32BIT-NEXT: stw 7, 8(5)
+; LE-32BIT-NEXT: stw 6, 12(5)
+; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = shl i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_32bytes:
; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: ld 6, 24(3)
; LE-64BIT-NEXT: lxvd2x 0, 0, 3
-; LE-64BIT-NEXT: ld 6, 16(3)
-; LE-64BIT-NEXT: ld 3, 24(3)
+; LE-64BIT-NEXT: lwz 4, 0(4)
; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: ld 3, 16(3)
+; LE-64BIT-NEXT: sradi 8, 6, 63
+; LE-64BIT-NEXT: rlwinm 9, 4, 0, 27, 28
+; LE-64BIT-NEXT: stxvd2x 0, 0, 7
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: rlwinm 3, 4, 3, 26, 28
+; LE-64BIT-NEXT: ldux 4, 9, 7
+; LE-64BIT-NEXT: ld 7, 8(9)
+; LE-64BIT-NEXT: subfic 6, 3, 64
+; LE-64BIT-NEXT: ld 8, 16(9)
+; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 4, 4, 3
+; LE-64BIT-NEXT: sld 10, 7, 6
+; LE-64BIT-NEXT: sld 11, 9, 6
+; LE-64BIT-NEXT: srd 7, 7, 3
+; LE-64BIT-NEXT: sld 6, 8, 6
+; LE-64BIT-NEXT: or 4, 10, 4
+; LE-64BIT-NEXT: srd 10, 8, 3
+; LE-64BIT-NEXT: srad 3, 9, 3
+; LE-64BIT-NEXT: or 6, 6, 7
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 4, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: ashr_32bytes:
+; BE: # %bb.0:
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: sradi 3, 7, 63
+; BE-NEXT: rlwinm 7, 4, 0, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
+; BE-NEXT: std 3, -64(1)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srad 3, 3, 4
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: ashr_32bytes:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -112(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: addi 6, 1, 48
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: srawi 3, 7, 31
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: stw 3, 40(1)
+; LE-32BIT-NEXT: stw 3, 36(1)
+; LE-32BIT-NEXT: stw 3, 32(1)
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
+; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: srw 28, 8, 4
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 0, 12
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: stw 3, 20(5)
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: sraw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_32bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: ld 6, 24(3)
+; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: lwz 4, 0(4)
-; LE-64BIT-NEXT: li 8, 16
-; LE-64BIT-NEXT: std 3, 24(7)
-; LE-64BIT-NEXT: sradi 3, 3, 63
-; LE-64BIT-NEXT: std 6, 16(7)
-; LE-64BIT-NEXT: std 3, 56(7)
-; LE-64BIT-NEXT: std 3, 48(7)
-; LE-64BIT-NEXT: std 3, 40(7)
-; LE-64BIT-NEXT: std 3, 32(7)
-; LE-64BIT-NEXT: clrldi 3, 4, 59
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: ld 3, 16(3)
+; LE-64BIT-NEXT: sradi 8, 6, 63
+; LE-64BIT-NEXT: rlwinm 9, 4, 2, 27, 28
; LE-64BIT-NEXT: stxvd2x 0, 0, 7
-; LE-64BIT-NEXT: lxvd2x 0, 7, 3
-; LE-64BIT-NEXT: add 3, 7, 3
-; LE-64BIT-NEXT: lxvd2x 1, 3, 8
-; LE-64BIT-NEXT: stxvd2x 1, 5, 8
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: rlwinm 3, 4, 5, 26, 26
+; LE-64BIT-NEXT: ldux 4, 9, 7
+; LE-64BIT-NEXT: ld 7, 8(9)
+; LE-64BIT-NEXT: subfic 6, 3, 64
+; LE-64BIT-NEXT: ld 8, 16(9)
+; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 4, 4, 3
+; LE-64BIT-NEXT: sld 10, 7, 6
+; LE-64BIT-NEXT: sld 11, 9, 6
+; LE-64BIT-NEXT: srd 7, 7, 3
+; LE-64BIT-NEXT: sld 6, 8, 6
+; LE-64BIT-NEXT: or 4, 10, 4
+; LE-64BIT-NEXT: srd 10, 8, 3
+; LE-64BIT-NEXT: srad 3, 9, 3
+; LE-64BIT-NEXT: or 6, 6, 7
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 4, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: ashr_32bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: sradi 3, 7, 63
+; BE-NEXT: rlwinm 7, 4, 2, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
+; BE-NEXT: std 3, -64(1)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: rlwinm 4, 4, 5, 26, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srad 3, 3, 4
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: ashr_32bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: addi 6, 1, 48
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: srawi 3, 7, 31
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: stw 3, 40(1)
+; LE-32BIT-NEXT: stw 3, 36(1)
+; LE-32BIT-NEXT: stw 3, 32(1)
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
+; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
+; LE-32BIT-NEXT: sub 3, 6, 4
+; LE-32BIT-NEXT: lwz 4, 4(3)
+; LE-32BIT-NEXT: lwz 6, 0(3)
+; LE-32BIT-NEXT: lwz 7, 12(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 20(3)
+; LE-32BIT-NEXT: lwz 10, 16(3)
+; LE-32BIT-NEXT: lwz 11, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: stw 10, 16(5)
+; LE-32BIT-NEXT: stw 9, 20(5)
+; LE-32BIT-NEXT: stw 8, 8(5)
+; LE-32BIT-NEXT: stw 7, 12(5)
+; LE-32BIT-NEXT: stw 6, 0(5)
+; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_32bytes_dwordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lxvd2x 0, 0, 3
+; LE-64BIT-NEXT: ld 6, 16(3)
+; LE-64BIT-NEXT: ld 7, 24(3)
+; LE-64BIT-NEXT: lwz 3, 0(4)
+; LE-64BIT-NEXT: addi 4, 1, -64
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28
+; LE-64BIT-NEXT: stxvd2x 0, 0, 4
+; LE-64BIT-NEXT: std 6, -48(1)
+; LE-64BIT-NEXT: sradi 6, 7, 63
+; LE-64BIT-NEXT: std 7, -40(1)
+; LE-64BIT-NEXT: std 6, -8(1)
+; LE-64BIT-NEXT: std 6, -16(1)
+; LE-64BIT-NEXT: std 6, -24(1)
+; LE-64BIT-NEXT: std 6, -32(1)
+; LE-64BIT-NEXT: lxvd2x 0, 4, 3
+; LE-64BIT-NEXT: add 3, 4, 3
+; LE-64BIT-NEXT: li 4, 16
+; LE-64BIT-NEXT: lxvd2x 1, 3, 4
+; LE-64BIT-NEXT: stxvd2x 1, 5, 4
; LE-64BIT-NEXT: stxvd2x 0, 0, 5
; LE-64BIT-NEXT: blr
;
-; BE-LABEL: ashr_32bytes:
+; BE-LABEL: ashr_32bytes_dwordOff:
; BE: # %bb.0:
; BE-NEXT: ld 7, 0(3)
; BE-NEXT: ld 8, 8(3)
; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 6, 1, -64
-; BE-NEXT: std 3, 56(6)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
; BE-NEXT: sradi 3, 7, 63
-; BE-NEXT: clrlwi 4, 4, 27
-; BE-NEXT: std 3, 24(6)
-; BE-NEXT: std 3, 16(6)
-; BE-NEXT: std 3, 8(6)
+; BE-NEXT: rlwinm 4, 4, 3, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
; BE-NEXT: std 3, -64(1)
; BE-NEXT: neg 3, 4
-; BE-NEXT: std 9, 48(6)
-; BE-NEXT: std 8, 40(6)
-; BE-NEXT: std 7, 32(6)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: std 7, -32(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: addi 4, 1, -32
-; BE-NEXT: ldux 3, 4, 3
-; BE-NEXT: ld 6, 8(4)
-; BE-NEXT: ld 7, 24(4)
-; BE-NEXT: ld 4, 16(4)
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: ld 4, 8(6)
+; BE-NEXT: ld 7, 24(6)
+; BE-NEXT: ld 6, 16(6)
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 4, 16(5)
+; BE-NEXT: std 6, 16(5)
; BE-NEXT: std 7, 24(5)
-; BE-NEXT: std 6, 8(5)
+; BE-NEXT: std 4, 8(5)
; BE-NEXT: blr
;
-; LE-32BIT-LABEL: ashr_32bytes:
+; LE-32BIT-LABEL: ashr_32bytes_dwordOff:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -80(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
@@ -707,7 +1915,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 28(4)
; LE-32BIT-NEXT: stw 3, 76(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
; LE-32BIT-NEXT: stw 0, 72(1)
; LE-32BIT-NEXT: stw 12, 68(1)
; LE-32BIT-NEXT: stw 11, 64(1)
@@ -743,11 +1951,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 80
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; LE: {{.*}}
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 044ddf562294c8..8e69547df6fcc1 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -209,45 +209,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
; LE-32BIT-NEXT: stw 6, 16(1)
-; LE-32BIT-NEXT: addi 6, 1, 32
-; LE-32BIT-NEXT: stw 7, 32(1)
-; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: sub 6, 6, 7
+; LE-32BIT-NEXT: addi 3, 1, 32
; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: li 3, 7
+; LE-32BIT-NEXT: sub 3, 3, 6
; LE-32BIT-NEXT: stw 8, 36(1)
-; LE-32BIT-NEXT: nand 3, 4, 3
-; LE-32BIT-NEXT: lwz 7, 4(6)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: lwz 8, 8(6)
-; LE-32BIT-NEXT: subfic 10, 4, 32
-; LE-32BIT-NEXT: lwz 9, 0(6)
-; LE-32BIT-NEXT: clrlwi 3, 3, 27
-; LE-32BIT-NEXT: lwz 6, 12(6)
-; LE-32BIT-NEXT: srw 11, 8, 4
-; LE-32BIT-NEXT: slw 8, 8, 10
-; LE-32BIT-NEXT: slw 10, 9, 10
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: srw 9, 9, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
-; LE-32BIT-NEXT: slw 3, 7, 3
-; LE-32BIT-NEXT: or 6, 8, 6
-; LE-32BIT-NEXT: or 4, 10, 4
-; LE-32BIT-NEXT: or 3, 11, 3
-; LE-32BIT-NEXT: stw 9, 0(5)
-; LE-32BIT-NEXT: stw 6, 12(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -304,34 +300,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 3, 28(1)
; LE-32BIT-NEXT: addi 3, 1, 16
; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: subfic 8, 4, 32
; LE-32BIT-NEXT: stw 7, 16(1)
-; LE-32BIT-NEXT: li 7, 7
; LE-32BIT-NEXT: lwzux 3, 6, 3
-; LE-32BIT-NEXT: nand 7, 4, 7
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: subfic 10, 4, 32
-; LE-32BIT-NEXT: lwz 8, 8(6)
-; LE-32BIT-NEXT: clrlwi 7, 7, 27
; LE-32BIT-NEXT: lwz 9, 4(6)
; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 7, 8(6)
; LE-32BIT-NEXT: lwz 6, 12(6)
; LE-32BIT-NEXT: slw 11, 9, 4
-; LE-32BIT-NEXT: srw 9, 9, 10
-; LE-32BIT-NEXT: srw 10, 6, 10
-; LE-32BIT-NEXT: slw 6, 6, 4
-; LE-32BIT-NEXT: slw 4, 8, 4
-; LE-32BIT-NEXT: srwi 8, 8, 1
-; LE-32BIT-NEXT: srw 7, 8, 7
+; LE-32BIT-NEXT: srw 9, 9, 8
+; LE-32BIT-NEXT: srw 10, 7, 8
+; LE-32BIT-NEXT: srw 8, 6, 8
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
; LE-32BIT-NEXT: or 3, 3, 9
-; LE-32BIT-NEXT: or 4, 4, 10
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: or 4, 7, 8
; LE-32BIT-NEXT: stw 3, 0(5)
-; LE-32BIT-NEXT: or 3, 11, 7
-; LE-32BIT-NEXT: stw 6, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
; LE-32BIT-NEXT: stw 4, 8(5)
; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
@@ -387,46 +379,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
-; LE-32BIT-NEXT: li 6, 7
+; LE-32BIT-NEXT: addi 6, 1, 32
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: stw 8, 36(1)
-; LE-32BIT-NEXT: rlwinm 8, 4, 29, 28, 31
; LE-32BIT-NEXT: stw 7, 32(1)
-; LE-32BIT-NEXT: addi 7, 1, 32
+; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: nand 6, 4, 6
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
; LE-32BIT-NEXT: stw 3, 28(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
; LE-32BIT-NEXT: stw 3, 24(1)
-; LE-32BIT-NEXT: subfic 10, 4, 32
; LE-32BIT-NEXT: stw 3, 20(1)
-; LE-32BIT-NEXT: clrlwi 6, 6, 27
; LE-32BIT-NEXT: stw 3, 16(1)
-; LE-32BIT-NEXT: sub 3, 7, 8
-; LE-32BIT-NEXT: lwz 7, 4(3)
-; LE-32BIT-NEXT: lwz 8, 8(3)
-; LE-32BIT-NEXT: lwz 9, 0(3)
-; LE-32BIT-NEXT: lwz 3, 12(3)
-; LE-32BIT-NEXT: srw 11, 8, 4
-; LE-32BIT-NEXT: slw 8, 8, 10
-; LE-32BIT-NEXT: slw 10, 9, 10
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
; LE-32BIT-NEXT: srw 3, 3, 4
-; LE-32BIT-NEXT: sraw 9, 9, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
-; LE-32BIT-NEXT: or 3, 8, 3
-; LE-32BIT-NEXT: slw 6, 7, 6
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: sraw 4, 7, 4
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 10, 4
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 11, 6
-; LE-32BIT-NEXT: stw 9, 0(5)
-; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -449,32 +437,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: li 4, 48
; LE-64BIT-NEXT: stxvd2x 2, 7, 4
; LE-64BIT-NEXT: stxvd2x 2, 7, 8
-; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 31
+; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 3, 3, 26
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: stxvd2x 1, 0, 7
-; LE-64BIT-NEXT: li 6, 7
-; LE-64BIT-NEXT: ldux 7, 4, 7
-; LE-64BIT-NEXT: ld 8, 16(4)
-; LE-64BIT-NEXT: nand 6, 3, 6
+; LE-64BIT-NEXT: xori 8, 3, 63
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: ld 7, 16(4)
; LE-64BIT-NEXT: ld 9, 8(4)
-; LE-64BIT-NEXT: clrlwi 3, 3, 29
; LE-64BIT-NEXT: ld 4, 24(4)
-; LE-64BIT-NEXT: clrlwi 6, 6, 26
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sldi 11, 7, 1
+; LE-64BIT-NEXT: srd 10, 9, 3
; LE-64BIT-NEXT: srd 7, 7, 3
-; LE-64BIT-NEXT: sldi 10, 8, 1
-; LE-64BIT-NEXT: srd 11, 9, 3
-; LE-64BIT-NEXT: srd 8, 8, 3
-; LE-64BIT-NEXT: sld 6, 10, 6
+; LE-64BIT-NEXT: sld 8, 11, 8
+; LE-64BIT-NEXT: or 8, 10, 8
; LE-64BIT-NEXT: subfic 10, 3, 64
; LE-64BIT-NEXT: srd 3, 4, 3
-; LE-64BIT-NEXT: or 6, 11, 6
; LE-64BIT-NEXT: sld 11, 4, 10
; LE-64BIT-NEXT: sld 9, 9, 10
; LE-64BIT-NEXT: std 3, 24(5)
-; LE-64BIT-NEXT: or 7, 9, 7
-; LE-64BIT-NEXT: or 3, 11, 8
-; LE-64BIT-NEXT: std 6, 8(5)
-; LE-64BIT-NEXT: std 7, 0(5)
+; LE-64BIT-NEXT: std 8, 8(5)
+; LE-64BIT-NEXT: or 6, 9, 6
+; LE-64BIT-NEXT: or 3, 11, 7
+; LE-64BIT-NEXT: std 6, 0(5)
; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
@@ -485,44 +471,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: addi 11, 1, -32
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: rlwinm 3, 4, 29, 27, 31
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 29, 27, 28
; BE-NEXT: neg 3, 3
-; BE-NEXT: std 10, 24(9)
-; BE-NEXT: std 10, 16(9)
-; BE-NEXT: std 10, 8(9)
-; BE-NEXT: std 10, -64(1)
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
-; BE-NEXT: std 6, 32(9)
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: ldux 3, 11, 3
-; BE-NEXT: li 6, 7
-; BE-NEXT: nand 6, 4, 6
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 6, 6, 26
-; BE-NEXT: ld 7, 8(11)
-; BE-NEXT: ld 8, 16(11)
-; BE-NEXT: ld 9, 24(11)
-; BE-NEXT: subfic 10, 4, 64
-; BE-NEXT: sldi 11, 7, 1
-; BE-NEXT: srd 7, 7, 4
-; BE-NEXT: srd 9, 9, 4
-; BE-NEXT: sld 6, 11, 6
-; BE-NEXT: sld 11, 3, 10
-; BE-NEXT: sld 10, 8, 10
-; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
; BE-NEXT: srd 3, 3, 4
-; BE-NEXT: or 7, 11, 7
-; BE-NEXT: or 6, 8, 6
-; BE-NEXT: or 8, 10, 9
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 8, 24(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: lshr_32bytes:
@@ -538,7 +519,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 0, 24(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 6, 48(1)
; LE-32BIT-NEXT: stw 6, 44(1)
; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
@@ -546,68 +526,65 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31
-; LE-32BIT-NEXT: stw 3, 80(1)
-; LE-32BIT-NEXT: addi 3, 1, 52
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: sub 3, 3, 6
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: stw 0, 76(1)
-; LE-32BIT-NEXT: stw 12, 72(1)
-; LE-32BIT-NEXT: stw 11, 68(1)
-; LE-32BIT-NEXT: stw 10, 64(1)
-; LE-32BIT-NEXT: stw 9, 60(1)
-; LE-32BIT-NEXT: li 9, 7
-; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: nand 9, 4, 9
-; LE-32BIT-NEXT: stw 7, 52(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: lwz 6, 4(3)
; LE-32BIT-NEXT: subfic 30, 4, 32
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: clrlwi 9, 9, 27
-; LE-32BIT-NEXT: lwz 8, 12(3)
-; LE-32BIT-NEXT: slwi 29, 6, 1
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: srw 28, 7, 4
-; LE-32BIT-NEXT: lwz 11, 20(3)
-; LE-32BIT-NEXT: slwi 27, 8, 1
-; LE-32BIT-NEXT: lwz 12, 24(3)
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: xori 12, 4, 31
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: lwz 6, 8(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: srw 29, 6, 4
+; LE-32BIT-NEXT: lwz 9, 12(3)
+; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: srw 27, 9, 4
+; LE-32BIT-NEXT: lwz 0, 28(3)
; LE-32BIT-NEXT: srw 26, 10, 4
-; LE-32BIT-NEXT: lwz 0, 0(3)
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: srw 25, 12, 4
-; LE-32BIT-NEXT: slw 12, 12, 30
-; LE-32BIT-NEXT: slw 7, 7, 30
-; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: slw 25, 11, 30
+; LE-32BIT-NEXT: slw 9, 9, 30
; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 0, 30
-; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 3, 3, 4
; LE-32BIT-NEXT: srw 0, 0, 4
-; LE-32BIT-NEXT: srw 4, 11, 4
-; LE-32BIT-NEXT: or 3, 12, 3
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 30, 0
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 10, 4
-; LE-32BIT-NEXT: slwi 11, 11, 1
+; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 7, 8
-; LE-32BIT-NEXT: slw 29, 29, 9
-; LE-32BIT-NEXT: slw 27, 27, 9
-; LE-32BIT-NEXT: slw 9, 11, 9
+; LE-32BIT-NEXT: or 3, 6, 27
+; LE-32BIT-NEXT: slw 7, 7, 12
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 30, 6
+; LE-32BIT-NEXT: or 3, 28, 4
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 25, 9
-; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 26, 27
-; LE-32BIT-NEXT: stw 3, 16(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 0, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 7
+; LE-32BIT-NEXT: stw 8, 0(5)
; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
@@ -635,37 +612,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
; LE-64BIT-NEXT: stxvd2x 2, 7, 6
; LE-64BIT-NEXT: li 6, 48
-; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 31
+; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 4, 4, 26
; LE-64BIT-NEXT: neg 3, 3
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: li 6, 32
; LE-64BIT-NEXT: extsw 3, 3
; LE-64BIT-NEXT: stxvd2x 1, 7, 6
; LE-64BIT-NEXT: stxvd2x 2, 0, 7
-; LE-64BIT-NEXT: li 6, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
; LE-64BIT-NEXT: ldux 3, 8, 3
-; LE-64BIT-NEXT: ld 7, 8(8)
-; LE-64BIT-NEXT: nand 6, 4, 6
-; LE-64BIT-NEXT: ld 9, 16(8)
-; LE-64BIT-NEXT: clrlwi 4, 4, 29
-; LE-64BIT-NEXT: ld 8, 24(8)
-; LE-64BIT-NEXT: clrlwi 6, 6, 26
-; LE-64BIT-NEXT: rldicl 10, 7, 63, 1
-; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
; LE-64BIT-NEXT: sld 7, 7, 4
-; LE-64BIT-NEXT: srd 6, 10, 6
-; LE-64BIT-NEXT: sld 10, 9, 4
-; LE-64BIT-NEXT: or 6, 10, 6
-; LE-64BIT-NEXT: subfic 10, 4, 64
-; LE-64BIT-NEXT: srd 9, 9, 10
-; LE-64BIT-NEXT: srd 10, 3, 10
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
; LE-64BIT-NEXT: sld 3, 3, 4
-; LE-64BIT-NEXT: std 6, 16(5)
-; LE-64BIT-NEXT: or 7, 7, 10
+; LE-64BIT-NEXT: or 6, 8, 6
; LE-64BIT-NEXT: std 3, 0(5)
-; LE-64BIT-NEXT: or 3, 8, 9
-; LE-64BIT-NEXT: std 7, 8(5)
-; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: shl_32bytes:
@@ -675,41 +648,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 56(9)
-; BE-NEXT: std 10, 48(9)
-; BE-NEXT: std 10, 40(9)
-; BE-NEXT: std 10, 32(9)
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 8, 16(9)
-; BE-NEXT: std 7, 8(9)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
; BE-NEXT: std 6, -64(1)
-; BE-NEXT: rlwinm 3, 4, 29, 27, 31
-; BE-NEXT: ldux 6, 3, 9
-; BE-NEXT: li 7, 7
-; BE-NEXT: nand 7, 4, 7
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 7, 7, 26
-; BE-NEXT: ld 8, 16(3)
-; BE-NEXT: ld 9, 8(3)
+; BE-NEXT: rlwinm 3, 4, 29, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
; BE-NEXT: ld 3, 24(3)
-; BE-NEXT: subfic 10, 4, 64
; BE-NEXT: sld 6, 6, 4
-; BE-NEXT: rldicl 11, 8, 63, 1
-; BE-NEXT: sld 8, 8, 4
-; BE-NEXT: srd 7, 11, 7
-; BE-NEXT: srd 11, 9, 10
-; BE-NEXT: sld 9, 9, 4
-; BE-NEXT: srd 10, 3, 10
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
; BE-NEXT: sld 3, 3, 4
-; BE-NEXT: or 6, 6, 11
-; BE-NEXT: or 7, 9, 7
-; BE-NEXT: or 8, 8, 10
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
; BE-NEXT: std 3, 24(5)
-; BE-NEXT: std 8, 16(5)
+; BE-NEXT: std 7, 16(5)
; BE-NEXT: std 6, 0(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: shl_32bytes:
@@ -731,7 +700,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: stw 6, 80(1)
; LE-32BIT-NEXT: stw 6, 76(1)
; LE-32BIT-NEXT: stw 6, 72(1)
; LE-32BIT-NEXT: stw 6, 68(1)
@@ -739,61 +707,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 60(1)
; LE-32BIT-NEXT: stw 6, 56(1)
; LE-32BIT-NEXT: stw 6, 52(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31
-; LE-32BIT-NEXT: stw 3, 48(1)
-; LE-32BIT-NEXT: addi 3, 1, 20
-; LE-32BIT-NEXT: stw 0, 44(1)
-; LE-32BIT-NEXT: stw 12, 40(1)
-; LE-32BIT-NEXT: stw 11, 36(1)
-; LE-32BIT-NEXT: stw 10, 32(1)
-; LE-32BIT-NEXT: stw 9, 28(1)
-; LE-32BIT-NEXT: stw 8, 24(1)
-; LE-32BIT-NEXT: li 8, 7
-; LE-32BIT-NEXT: stw 7, 20(1)
-; LE-32BIT-NEXT: nand 8, 4, 8
+; LE-32BIT-NEXT: stw 6, 48(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 0, 40(1)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 12, 36(1)
+; LE-32BIT-NEXT: subfic 12, 4, 32
+; LE-32BIT-NEXT: stw 11, 32(1)
+; LE-32BIT-NEXT: stw 10, 28(1)
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: stw 7, 16(1)
; LE-32BIT-NEXT: lwzux 3, 6, 3
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: subfic 0, 4, 32
-; LE-32BIT-NEXT: clrlwi 8, 8, 27
; LE-32BIT-NEXT: lwz 7, 8(6)
; LE-32BIT-NEXT: slw 3, 3, 4
-; LE-32BIT-NEXT: lwz 9, 4(6)
-; LE-32BIT-NEXT: lwz 10, 16(6)
-; LE-32BIT-NEXT: srwi 29, 7, 1
-; LE-32BIT-NEXT: lwz 11, 12(6)
-; LE-32BIT-NEXT: slw 28, 9, 4
-; LE-32BIT-NEXT: lwz 12, 24(6)
-; LE-32BIT-NEXT: srwi 27, 10, 1
-; LE-32BIT-NEXT: lwz 30, 20(6)
-; LE-32BIT-NEXT: slw 26, 11, 4
+; LE-32BIT-NEXT: lwz 8, 4(6)
+; LE-32BIT-NEXT: lwz 9, 16(6)
+; LE-32BIT-NEXT: srw 30, 7, 12
+; LE-32BIT-NEXT: lwz 10, 12(6)
+; LE-32BIT-NEXT: slw 29, 8, 4
+; LE-32BIT-NEXT: lwz 11, 24(6)
+; LE-32BIT-NEXT: srw 8, 8, 12
+; LE-32BIT-NEXT: lwz 0, 20(6)
+; LE-32BIT-NEXT: srw 28, 9, 12
; LE-32BIT-NEXT: lwz 6, 28(6)
-; LE-32BIT-NEXT: srw 9, 9, 0
-; LE-32BIT-NEXT: slw 25, 30, 4
-; LE-32BIT-NEXT: srw 11, 11, 0
+; LE-32BIT-NEXT: slw 27, 10, 4
+; LE-32BIT-NEXT: srw 10, 10, 12
; LE-32BIT-NEXT: slw 7, 7, 4
-; LE-32BIT-NEXT: srw 30, 30, 0
-; LE-32BIT-NEXT: slw 10, 10, 4
-; LE-32BIT-NEXT: srw 0, 6, 0
-; LE-32BIT-NEXT: slw 6, 6, 4
-; LE-32BIT-NEXT: slw 4, 12, 4
-; LE-32BIT-NEXT: srwi 12, 12, 1
-; LE-32BIT-NEXT: srw 29, 29, 8
-; LE-32BIT-NEXT: srw 27, 27, 8
-; LE-32BIT-NEXT: srw 8, 12, 8
-; LE-32BIT-NEXT: or 3, 3, 9
-; LE-32BIT-NEXT: or 4, 4, 0
-; LE-32BIT-NEXT: stw 3, 0(5)
-; LE-32BIT-NEXT: or 3, 25, 8
+; LE-32BIT-NEXT: srw 26, 11, 12
+; LE-32BIT-NEXT: slw 25, 0, 4
+; LE-32BIT-NEXT: srw 0, 0, 12
+; LE-32BIT-NEXT: slw 9, 9, 4
+; LE-32BIT-NEXT: srw 12, 6, 12
+; LE-32BIT-NEXT: slw 11, 11, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: stw 4, 28(5)
+; LE-32BIT-NEXT: or 4, 11, 12
; LE-32BIT-NEXT: stw 4, 24(5)
-; LE-32BIT-NEXT: or 4, 10, 30
-; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 26, 27
+; LE-32BIT-NEXT: or 4, 9, 0
; LE-32BIT-NEXT: stw 4, 16(5)
-; LE-32BIT-NEXT: or 4, 7, 11
-; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 6, 28(5)
+; LE-32BIT-NEXT: or 4, 25, 26
+; LE-32BIT-NEXT: stw 4, 20(5)
+; LE-32BIT-NEXT: or 4, 7, 10
+; LE-32BIT-NEXT: or 3, 3, 8
; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: or 4, 27, 28
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 12(5)
; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
@@ -812,98 +775,91 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_32bytes:
; LE-64BIT: # %bb.0:
-; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: ld 6, 24(3)
+; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: lwz 4, 0(4)
; LE-64BIT-NEXT: addi 7, 1, -64
; LE-64BIT-NEXT: ld 3, 16(3)
; LE-64BIT-NEXT: sradi 8, 6, 63
-; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 31
-; LE-64BIT-NEXT: std 6, 24(7)
-; LE-64BIT-NEXT: std 3, 16(7)
-; LE-64BIT-NEXT: li 3, 7
-; LE-64BIT-NEXT: std 8, 56(7)
-; LE-64BIT-NEXT: std 8, 48(7)
-; LE-64BIT-NEXT: std 8, 40(7)
-; LE-64BIT-NEXT: std 8, 32(7)
+; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 4, 4, 26
; LE-64BIT-NEXT: stxvd2x 0, 0, 7
-; LE-64BIT-NEXT: nand 3, 4, 3
-; LE-64BIT-NEXT: clrlwi 4, 4, 29
-; LE-64BIT-NEXT: ldux 6, 9, 7
-; LE-64BIT-NEXT: ld 7, 16(9)
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: ldux 3, 9, 7
+; LE-64BIT-NEXT: xori 7, 4, 63
+; LE-64BIT-NEXT: ld 6, 16(9)
; LE-64BIT-NEXT: ld 8, 8(9)
-; LE-64BIT-NEXT: clrlwi 3, 3, 26
; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 3, 3, 4
+; LE-64BIT-NEXT: sldi 11, 6, 1
+; LE-64BIT-NEXT: srd 10, 8, 4
; LE-64BIT-NEXT: srd 6, 6, 4
-; LE-64BIT-NEXT: sldi 10, 7, 1
-; LE-64BIT-NEXT: srd 11, 8, 4
-; LE-64BIT-NEXT: srd 7, 7, 4
-; LE-64BIT-NEXT: sld 3, 10, 3
+; LE-64BIT-NEXT: sld 7, 11, 7
+; LE-64BIT-NEXT: or 7, 10, 7
; LE-64BIT-NEXT: subfic 10, 4, 64
; LE-64BIT-NEXT: srad 4, 9, 4
-; LE-64BIT-NEXT: or 3, 11, 3
-; LE-64BIT-NEXT: sld 11, 9, 10
; LE-64BIT-NEXT: sld 8, 8, 10
+; LE-64BIT-NEXT: sld 11, 9, 10
; LE-64BIT-NEXT: std 4, 24(5)
-; LE-64BIT-NEXT: or 6, 8, 6
-; LE-64BIT-NEXT: or 4, 11, 7
-; LE-64BIT-NEXT: std 3, 8(5)
-; LE-64BIT-NEXT: std 6, 0(5)
-; LE-64BIT-NEXT: std 4, 16(5)
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: or 3, 8, 3
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 11, 6
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: ashr_32bytes:
; BE: # %bb.0:
-; BE-NEXT: ld 6, 0(3)
-; BE-NEXT: ld 7, 8(3)
-; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: addi 10, 1, -32
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: std 6, 32(9)
-; BE-NEXT: sradi 3, 6, 63
-; BE-NEXT: rlwinm 6, 4, 29, 27, 31
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 3, 16(9)
-; BE-NEXT: std 3, 8(9)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: sradi 3, 7, 63
+; BE-NEXT: rlwinm 7, 4, 29, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
; BE-NEXT: std 3, -64(1)
-; BE-NEXT: neg 3, 6
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: ldux 3, 10, 3
-; BE-NEXT: li 6, 7
-; BE-NEXT: nand 6, 4, 6
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 6, 6, 26
-; BE-NEXT: ld 7, 8(10)
-; BE-NEXT: ld 8, 16(10)
-; BE-NEXT: ld 9, 24(10)
-; BE-NEXT: subfic 10, 4, 64
-; BE-NEXT: sldi 11, 7, 1
-; BE-NEXT: srd 7, 7, 4
-; BE-NEXT: srd 9, 9, 4
-; BE-NEXT: sld 6, 11, 6
-; BE-NEXT: sld 11, 3, 10
-; BE-NEXT: sld 10, 8, 10
-; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
; BE-NEXT: srad 3, 3, 4
-; BE-NEXT: or 7, 11, 7
-; BE-NEXT: or 6, 8, 6
-; BE-NEXT: or 8, 10, 9
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 8, 24(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: ashr_32bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -112(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
-; LE-32BIT-NEXT: addi 6, 1, 52
+; LE-32BIT-NEXT: addi 6, 1, 48
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 10, 12(3)
@@ -912,76 +868,72 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 0, 24(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 3, 80(1)
+; LE-32BIT-NEXT: stw 3, 76(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: stw 7, 52(1)
-; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 31
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 29
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: stw 0, 76(1)
-; LE-32BIT-NEXT: stw 12, 72(1)
-; LE-32BIT-NEXT: stw 11, 68(1)
-; LE-32BIT-NEXT: stw 10, 64(1)
-; LE-32BIT-NEXT: stw 9, 60(1)
-; LE-32BIT-NEXT: li 9, 7
-; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: nand 9, 4, 9
-; LE-32BIT-NEXT: stw 3, 48(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: subfic 30, 4, 32
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: xori 12, 4, 31
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: stw 3, 40(1)
-; LE-32BIT-NEXT: clrlwi 9, 9, 27
; LE-32BIT-NEXT: stw 3, 36(1)
; LE-32BIT-NEXT: stw 3, 32(1)
; LE-32BIT-NEXT: stw 3, 28(1)
; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
; LE-32BIT-NEXT: sub 3, 6, 7
-; LE-32BIT-NEXT: lwz 6, 4(3)
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: lwz 8, 12(3)
-; LE-32BIT-NEXT: slwi 29, 6, 1
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: srw 28, 7, 4
-; LE-32BIT-NEXT: lwz 11, 20(3)
-; LE-32BIT-NEXT: slwi 27, 8, 1
-; LE-32BIT-NEXT: lwz 12, 24(3)
+; LE-32BIT-NEXT: lwz 6, 8(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: srw 29, 6, 4
+; LE-32BIT-NEXT: lwz 9, 12(3)
+; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: srw 27, 9, 4
+; LE-32BIT-NEXT: lwz 0, 28(3)
; LE-32BIT-NEXT: srw 26, 10, 4
-; LE-32BIT-NEXT: lwz 0, 0(3)
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: srw 25, 12, 4
-; LE-32BIT-NEXT: slw 12, 12, 30
-; LE-32BIT-NEXT: slw 7, 7, 30
-; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: slw 25, 11, 30
+; LE-32BIT-NEXT: slw 9, 9, 30
; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 0, 30
-; LE-32BIT-NEXT: srw 8, 8, 4
-; LE-32BIT-NEXT: sraw 0, 0, 4
-; LE-32BIT-NEXT: srw 4, 11, 4
-; LE-32BIT-NEXT: or 3, 12, 3
+; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 0, 0, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 30, 0
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 10, 4
-; LE-32BIT-NEXT: slwi 11, 11, 1
+; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: sraw 8, 8, 4
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 7, 8
-; LE-32BIT-NEXT: slw 29, 29, 9
-; LE-32BIT-NEXT: slw 27, 27, 9
-; LE-32BIT-NEXT: slw 9, 11, 9
+; LE-32BIT-NEXT: or 3, 6, 27
+; LE-32BIT-NEXT: slw 7, 7, 12
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 30, 6
+; LE-32BIT-NEXT: or 3, 28, 4
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 25, 9
-; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 26, 27
-; LE-32BIT-NEXT: stw 3, 16(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 0, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 7
+; LE-32BIT-NEXT: stw 8, 0(5)
; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed7257..5ba8755201ddf5 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb a1, 12(sp)
-; RV32I-NEXT: sb a5, 8(sp)
-; RV32I-NEXT: sb a4, 4(sp)
-; RV32I-NEXT: sb a3, 0(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 15(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 14(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 13(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 11(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 10(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 9(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 7(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 6(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 3(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 2(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 1(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a1, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 8(a1)
; RV32I-NEXT: lw a5, 4(a1)
; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: sb a3, 12(sp)
-; RV32I-NEXT: sb a4, 8(sp)
-; RV32I-NEXT: sb a5, 4(sp)
-; RV32I-NEXT: sb a1, 0(sp)
-; RV32I-NEXT: srai a6, a3, 31
-; RV32I-NEXT: sb a6, 28(sp)
-; RV32I-NEXT: sb a6, 24(sp)
-; RV32I-NEXT: sb a6, 20(sp)
-; RV32I-NEXT: sb a6, 16(sp)
-; RV32I-NEXT: srli a7, a3, 24
-; RV32I-NEXT: sb a7, 15(sp)
-; RV32I-NEXT: srli a7, a3, 16
-; RV32I-NEXT: sb a7, 14(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(sp)
-; RV32I-NEXT: srli a3, a4, 24
-; RV32I-NEXT: sb a3, 11(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 10(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 9(sp)
-; RV32I-NEXT: srli a3, a5, 24
-; RV32I-NEXT: sb a3, 7(sp)
-; RV32I-NEXT: srli a3, a5, 16
-; RV32I-NEXT: sb a3, 6(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 5(sp)
-; RV32I-NEXT: srli a3, a1, 24
-; RV32I-NEXT: sb a3, 3(sp)
-; RV32I-NEXT: srli a3, a1, 16
-; RV32I-NEXT: sb a3, 2(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 1(sp)
-; RV32I-NEXT: srli a1, a6, 24
-; RV32I-NEXT: sb a1, 31(sp)
-; RV32I-NEXT: srli a3, a6, 16
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: sb a3, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: sb a3, 18(sp)
-; RV32I-NEXT: sb a4, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw a3, 12(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw a1, 0(sp)
+; RV32I-NEXT: srai a3, a3, 31
+; RV32I-NEXT: sw a3, 28(sp)
+; RV32I-NEXT: sw a3, 24(sp)
+; RV32I-NEXT: sw a3, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb zero, 11(sp)
-; RV32I-NEXT: sb zero, 10(sp)
-; RV32I-NEXT: sb zero, 9(sp)
-; RV32I-NEXT: sb zero, 8(sp)
-; RV32I-NEXT: sb zero, 7(sp)
-; RV32I-NEXT: sb zero, 6(sp)
-; RV32I-NEXT: sb zero, 5(sp)
-; RV32I-NEXT: sb zero, 4(sp)
-; RV32I-NEXT: sb zero, 3(sp)
-; RV32I-NEXT: sb zero, 2(sp)
-; RV32I-NEXT: sb zero, 1(sp)
-; RV32I-NEXT: sb zero, 0(sp)
-; RV32I-NEXT: sb a1, 28(sp)
-; RV32I-NEXT: sb a5, 24(sp)
-; RV32I-NEXT: sb a4, 20(sp)
-; RV32I-NEXT: sb a3, 16(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 31(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 30(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 29(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 26(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 25(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 22(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 18(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a1, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: addi a3, sp, 16
-; RV32I-NEXT: sub a1, a3, a1
-; RV32I-NEXT: lbu a3, 5(a1)
-; RV32I-NEXT: lbu a4, 4(a1)
-; RV32I-NEXT: lbu a5, 6(a1)
-; RV32I-NEXT: lbu a6, 7(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
-; RV32I-NEXT: sll a4, a3, a2
-; RV32I-NEXT: lbu a5, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
-; RV32I-NEXT: lbu a7, 2(a1)
-; RV32I-NEXT: lbu t0, 3(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: srli a6, a5, 1
-; RV32I-NEXT: xori a7, a2, 31
+; RV32I-NEXT: sub a3, a3, a1
+; RV32I-NEXT: lw a1, 4(a3)
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a1, a2
+; RV32I-NEXT: srli a6, a4, 1
+; RV32I-NEXT: andi a7, a2, 31
+; RV32I-NEXT: lw t0, 8(a3)
+; RV32I-NEXT: xori a7, a7, 31
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: lbu a6, 9(a1)
-; RV32I-NEXT: lbu t0, 8(a1)
-; RV32I-NEXT: lbu t1, 10(a1)
-; RV32I-NEXT: lbu t2, 11(a1)
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: or a6, a6, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: sll t0, a6, a2
-; RV32I-NEXT: srli a3, a3, 1
-; RV32I-NEXT: not t1, a2
-; RV32I-NEXT: srl a3, a3, t1
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: lbu t0, 13(a1)
-; RV32I-NEXT: lbu t1, 12(a1)
-; RV32I-NEXT: lbu t2, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t2
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: sll a1, a1, a2
-; RV32I-NEXT: srli a6, a6, 1
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: sll a6, t0, a2
+; RV32I-NEXT: lw a3, 12(a3)
+; RV32I-NEXT: srli a1, a1, 1
+; RV32I-NEXT: srl a1, a1, a7
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: srli a6, t0, 1
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: sll a2, a5, a2
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: sll a2, a4, a2
; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a1, 12(a0)
-; RV32I-NEXT: sw a3, 8(a0)
-; RV32I-NEXT: sw a4, 4(a0)
+; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a1, 8(a0)
+; RV32I-NEXT: sw a5, 4(a0)
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92bd..29fe0a7de6b3d4 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb a0, 19(sp)
-; RV32I-NEXT: sb s2, 18(sp)
-; RV32I-NEXT: sb s1, 17(sp)
-; RV32I-NEXT: sb s0, 16(sp)
-; RV32I-NEXT: sb t6, 15(sp)
-; RV32I-NEXT: sb t5, 14(sp)
-; RV32I-NEXT: sb t4, 13(sp)
-; RV32I-NEXT: sb t3, 12(sp)
-; RV32I-NEXT: sb t2, 11(sp)
-; RV32I-NEXT: sb t1, 10(sp)
-; RV32I-NEXT: sb t0, 9(sp)
-; RV32I-NEXT: sb a7, 8(sp)
-; RV32I-NEXT: sb a6, 7(sp)
-; RV32I-NEXT: sb a5, 6(sp)
-; RV32I-NEXT: sb a4, 5(sp)
-; RV32I-NEXT: sb a3, 4(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: addi a0, sp, 4
-; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or t0, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sll a6, t1, a6
+; RV32I-NEXT: or a6, a5, a6
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a6, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a6, 24
+; RV32I-NEXT: sb a0, 11(a2)
+; RV32I-NEXT: srli a0, a6, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, t0, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a0, t0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -823,6 +842,222 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i128 %res, ptr %dst, align 1
ret void
}
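
For readers following the new RV32I sequence above, here is a minimal C
sketch of what that expansion computes. This is an illustration only, not
the legalizer's code; the function name is mine, and it assumes a
little-endian target and a shift amount below 128 bits:

#include <stdint.h>
#include <string.h>

/* Shift-through-stack lshr of a 16-byte value on a 32-bit target: the
   value goes into the low half of a zero-filled double-wide slot, the
   slot is indexed at word granularity, and the residual sub-word shift
   is done with funnel shifts on aligned word loads. */
void lshr_16bytes_sketch(const uint8_t *src, unsigned bitOff, uint8_t *dst) {
  uint32_t slot[8] = {0};                  /* cf. sw zero, 16..28(sp) */
  memcpy(slot, src, 16);                   /* source words at 0..12(sp) */
  const uint32_t *p = slot + bitOff / 32;  /* cf. andi a0, a1, 12 */
  unsigned rem = bitOff % 32;              /* a byte multiple in the asm,
                                              hence andi a6, a1, 24 */
  uint32_t out[4];
  for (int i = 0; i < 4; ++i) {
    /* (w << 1) << (rem ^ 31) equals w << (32 - rem) for rem > 0 and
       folds to 0 for rem == 0; this mirrors the slli ..,1 / sll ..,a6
       pairs above, with a6 = (bitOff & 24) ^ 31. */
    out[i] = (p[i] >> rem) | ((p[i + 1] << 1) << (rem ^ 31));
  }
  memcpy(dst, out, 16);
}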
+
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 9(a0)
+; RV64I-NEXT: lbu a4, 8(a0)
+; RV64I-NEXT: lbu a5, 10(a0)
+; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 13(a0)
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a1)
+; RV64I-NEXT: lbu a5, 4(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
+; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 1(a1)
+; RV64I-NEXT: lbu a6, 0(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: slli a4, a4, 37
+; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: addi a4, a5, -64
+; RV64I-NEXT: srl a1, a3, a5
+; RV64I-NEXT: bltz a4, .LBB7_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: j .LBB7_3
+; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: lbu a6, 1(a0)
+; RV64I-NEXT: lbu a7, 0(a0)
+; RV64I-NEXT: lbu t0, 2(a0)
+; RV64I-NEXT: lbu t1, 3(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a0)
+; RV64I-NEXT: lbu t0, 4(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: srl a0, a0, a5
+; RV64I-NEXT: not a5, a5
+; RV64I-NEXT: slli a3, a3, 1
+; RV64I-NEXT: sll a3, a3, a5
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: srai a4, a4, 63
+; RV64I-NEXT: and a1, a4, a1
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 15(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 14(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 12
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: lw a1, 8(a0)
+; RV32I-NEXT: lw a3, 12(a0)
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: sb a1, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: sb a5, 10(a2)
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a1, a4, 16
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: srli a1, a4, 24
+; RV32I-NEXT: sb a1, 3(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 1(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
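
In this test %bitOff is %wordOff shl 5, so the low five bits of the shift
amount are known zero and the whole shift happens at word granularity: the
RV32I body above is just lw plus byte stores, with no residual srl/sll
funnel after the reload. A minimal sketch under the same assumptions as
above (hypothetical name, little-endian, offset reduced mod 4):

#include <stdint.h>
#include <string.h>

void lshr_16bytes_wordOff_sketch(const uint8_t *src, unsigned wordOff,
                                 uint8_t *dst) {
  uint32_t slot[8] = {0};
  memcpy(slot, src, 16);
  /* cf. slli a1, a1, 2 + andi a1, a1, 12: a word-aligned byte offset */
  memcpy(dst, slot + (wordOff & 3), 16);
}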
+
define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_16bytes:
; RV64I: # %bb.0:
@@ -873,11 +1108,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a4, a1
; RV64I-NEXT: addi a4, a5, -64
; RV64I-NEXT: sll a1, a3, a5
-; RV64I-NEXT: bltz a4, .LBB7_2
+; RV64I-NEXT: bltz a4, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: j .LBB7_3
-; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: j .LBB8_3
+; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: lbu a6, 9(a0)
; RV64I-NEXT: lbu a7, 8(a0)
; RV64I-NEXT: lbu t0, 10(a0)
@@ -905,7 +1140,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: srli a3, a3, 1
; RV64I-NEXT: srl a3, a3, a5
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: .LBB8_3:
; RV64I-NEXT: srai a4, a4, 63
; RV64I-NEXT: and a1, a4, a1
; RV64I-NEXT: sb a1, 0(a2)
@@ -942,98 +1177,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb zero, 11(sp)
-; RV32I-NEXT: sb zero, 10(sp)
-; RV32I-NEXT: sb zero, 9(sp)
-; RV32I-NEXT: sb zero, 8(sp)
-; RV32I-NEXT: sb zero, 7(sp)
-; RV32I-NEXT: sb zero, 6(sp)
-; RV32I-NEXT: sb zero, 5(sp)
-; RV32I-NEXT: sb zero, 4(sp)
-; RV32I-NEXT: sb a0, 35(sp)
-; RV32I-NEXT: sb s2, 34(sp)
-; RV32I-NEXT: sb s1, 33(sp)
-; RV32I-NEXT: sb s0, 32(sp)
-; RV32I-NEXT: sb t6, 31(sp)
-; RV32I-NEXT: sb t5, 30(sp)
-; RV32I-NEXT: sb t4, 29(sp)
-; RV32I-NEXT: sb t3, 28(sp)
-; RV32I-NEXT: sb t2, 27(sp)
-; RV32I-NEXT: sb t1, 26(sp)
-; RV32I-NEXT: sb t0, 25(sp)
-; RV32I-NEXT: sb a7, 24(sp)
-; RV32I-NEXT: sb a6, 23(sp)
-; RV32I-NEXT: sb a5, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a3, 20(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: addi a0, sp, 20
-; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: addi a3, sp, 16
+; RV32I-NEXT: sub a3, a3, a0
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a0, a1
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: srli a7, a4, 1
+; RV32I-NEXT: lw t0, 12(a3)
+; RV32I-NEXT: lw a3, 8(a3)
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: sll t0, t0, a1
+; RV32I-NEXT: srli t1, a3, 1
+; RV32I-NEXT: srl t1, t1, a6
+; RV32I-NEXT: or t1, t0, t1
+; RV32I-NEXT: sll a3, a3, a1
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a3, a3, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, t0, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a5, a5, 24
+; RV32I-NEXT: sb a5, 7(a2)
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb t1, 12(a2)
+; RV32I-NEXT: sb a7, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: srli a0, t1, 16
+; RV32I-NEXT: sb a0, 14(a2)
+; RV32I-NEXT: srli a0, t1, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a7, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a7, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1042,6 +1296,223 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i128 %res, ptr %dst, align 1
ret void
}
+
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a1)
+; RV64I-NEXT: lbu a5, 4(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
+; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 1(a1)
+; RV64I-NEXT: lbu a6, 0(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: slli a4, a4, 37
+; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: addi a4, a5, -64
+; RV64I-NEXT: sll a1, a3, a5
+; RV64I-NEXT: bltz a4, .LBB9_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: j .LBB9_3
+; RV64I-NEXT: .LBB9_2:
+; RV64I-NEXT: lbu a6, 9(a0)
+; RV64I-NEXT: lbu a7, 8(a0)
+; RV64I-NEXT: lbu t0, 10(a0)
+; RV64I-NEXT: lbu t1, 11(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 13(a0)
+; RV64I-NEXT: lbu t0, 12(a0)
+; RV64I-NEXT: lbu t1, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: sll a0, a0, a5
+; RV64I-NEXT: not a5, a5
+; RV64I-NEXT: srli a3, a3, 1
+; RV64I-NEXT: srl a3, a3, a5
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: .LBB9_3:
+; RV64I-NEXT: srai a4, a4, 63
+; RV64I-NEXT: and a1, a4, a1
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 7(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 6(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 5(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 12
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: lw a1, 8(a0)
+; RV32I-NEXT: lw a3, 12(a0)
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: sb a1, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: sb a5, 10(a2)
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a1, a4, 16
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: srli a1, a4, 24
+; RV32I-NEXT: sb a1, 3(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 1(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
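
shl mirrors the same layout: the zeros sit below the value, and the slot
pointer is formed by subtracting the word offset from the middle of the
slot, so a left shift by k words reads k words lower. Sketch, same caveats
as above:

#include <stdint.h>
#include <string.h>

void shl_16bytes_wordOff_sketch(const uint8_t *src, unsigned wordOff,
                                uint8_t *dst) {
  uint32_t slot[8] = {0};       /* cf. sw zero, 0..12(sp) */
  memcpy(slot + 4, src, 16);    /* value in the upper half, 16..28(sp) */
  /* cf. addi a3, sp, 16 followed by sub a3, a3, a0 */
  memcpy(dst, slot + 4 - (wordOff & 3), 16);
}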
+
+
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_16bytes:
; RV64I: # %bb.0:
@@ -1092,13 +1563,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a5, a1
; RV64I-NEXT: addi a6, a5, -64
; RV64I-NEXT: sra a1, a3, a5
-; RV64I-NEXT: bltz a6, .LBB8_2
+; RV64I-NEXT: bltz a6, .LBB10_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: sraiw a3, a4, 31
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: mv a1, a3
-; RV64I-NEXT: j .LBB8_3
-; RV64I-NEXT: .LBB8_2:
+; RV64I-NEXT: j .LBB10_3
+; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a6, 0(a0)
; RV64I-NEXT: lbu a7, 2(a0)
@@ -1126,7 +1597,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli a3, a3, 1
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: .LBB8_3:
+; RV64I-NEXT: .LBB10_3:
; RV64I-NEXT: sb a1, 8(a2)
; RV64I-NEXT: srli a3, a1, 56
; RV64I-NEXT: sb a3, 15(a2)
@@ -1161,105 +1632,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t2, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 14(a0)
-; RV32I-NEXT: lbu a0, 13(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 15(sp)
-; RV32I-NEXT: sb s3, 14(sp)
-; RV32I-NEXT: sb a0, 13(sp)
-; RV32I-NEXT: sb s2, 12(sp)
-; RV32I-NEXT: sb s1, 11(sp)
-; RV32I-NEXT: sb s0, 10(sp)
-; RV32I-NEXT: sb t6, 9(sp)
-; RV32I-NEXT: sb t5, 8(sp)
-; RV32I-NEXT: sb t4, 7(sp)
-; RV32I-NEXT: sb t3, 6(sp)
-; RV32I-NEXT: sb t2, 5(sp)
-; RV32I-NEXT: sb t1, 4(sp)
-; RV32I-NEXT: sb t0, 3(sp)
-; RV32I-NEXT: sb a7, 2(sp)
-; RV32I-NEXT: sb a6, 1(sp)
-; RV32I-NEXT: sb a5, 0(sp)
-; RV32I-NEXT: srai a4, a4, 31
-; RV32I-NEXT: sb a4, 28(sp)
-; RV32I-NEXT: sb a4, 24(sp)
-; RV32I-NEXT: sb a4, 20(sp)
-; RV32I-NEXT: sb a4, 16(sp)
-; RV32I-NEXT: srli a0, a4, 24
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: sb a0, 23(sp)
-; RV32I-NEXT: sb a3, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a0, 19(sp)
-; RV32I-NEXT: sb a3, 18(sp)
-; RV32I-NEXT: sb a4, 17(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: mv a0, sp
-; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or t0, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sll a6, t1, a6
+; RV32I-NEXT: or a6, a5, a6
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a6, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a6, 24
+; RV32I-NEXT: sb a0, 11(a2)
+; RV32I-NEXT: srli a0, a6, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, t0, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a0, t0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1269,1347 +1753,3730 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
ret void
}
-define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: lshr_32bytes:
-; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 87(sp)
-; RV64I-NEXT: sb a4, 86(sp)
-; RV64I-NEXT: sb a0, 85(sp)
-; RV64I-NEXT: sb a5, 84(sp)
-; RV64I-NEXT: sb a6, 83(sp)
-; RV64I-NEXT: sb a7, 82(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t0, 81(sp)
-; RV64I-NEXT: sb ra, 80(sp)
-; RV64I-NEXT: sb s11, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 56
-; RV64I-NEXT: add a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
-; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
-; RV64I-NEXT: ret
-;
-; RV32I-LABEL: lshr_32bytes:
-; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu t0, 25(a0)
-; RV32I-NEXT: lbu a7, 26(a0)
-; RV32I-NEXT: lbu a6, 27(a0)
-; RV32I-NEXT: lbu a5, 28(a0)
-; RV32I-NEXT: lbu a3, 31(a0)
-; RV32I-NEXT: lbu a4, 30(a0)
-; RV32I-NEXT: lbu a0, 29(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 59(sp)
-; RV32I-NEXT: sb a4, 58(sp)
-; RV32I-NEXT: sb a0, 57(sp)
-; RV32I-NEXT: sb a5, 56(sp)
-; RV32I-NEXT: sb a6, 55(sp)
-; RV32I-NEXT: sb a7, 54(sp)
-; RV32I-NEXT: sb zero, 91(sp)
-; RV32I-NEXT: sb zero, 90(sp)
-; RV32I-NEXT: sb zero, 89(sp)
-; RV32I-NEXT: sb zero, 88(sp)
-; RV32I-NEXT: sb zero, 87(sp)
-; RV32I-NEXT: sb zero, 86(sp)
-; RV32I-NEXT: sb zero, 85(sp)
-; RV32I-NEXT: sb zero, 84(sp)
-; RV32I-NEXT: sb zero, 83(sp)
-; RV32I-NEXT: sb zero, 82(sp)
-; RV32I-NEXT: sb zero, 81(sp)
-; RV32I-NEXT: sb zero, 80(sp)
-; RV32I-NEXT: sb zero, 79(sp)
-; RV32I-NEXT: sb zero, 78(sp)
-; RV32I-NEXT: sb zero, 77(sp)
-; RV32I-NEXT: sb zero, 76(sp)
-; RV32I-NEXT: sb zero, 75(sp)
-; RV32I-NEXT: sb zero, 74(sp)
-; RV32I-NEXT: sb zero, 73(sp)
-; RV32I-NEXT: sb zero, 72(sp)
-; RV32I-NEXT: sb zero, 71(sp)
-; RV32I-NEXT: sb zero, 70(sp)
-; RV32I-NEXT: sb zero, 69(sp)
-; RV32I-NEXT: sb zero, 68(sp)
-; RV32I-NEXT: sb zero, 67(sp)
-; RV32I-NEXT: sb zero, 66(sp)
-; RV32I-NEXT: sb zero, 65(sp)
-; RV32I-NEXT: sb zero, 64(sp)
-; RV32I-NEXT: sb zero, 63(sp)
-; RV32I-NEXT: sb zero, 62(sp)
-; RV32I-NEXT: sb zero, 61(sp)
-; RV32I-NEXT: sb zero, 60(sp)
-; RV32I-NEXT: sb t0, 53(sp)
-; RV32I-NEXT: sb ra, 52(sp)
-; RV32I-NEXT: sb s11, 51(sp)
-; RV32I-NEXT: sb s10, 50(sp)
-; RV32I-NEXT: sb s9, 49(sp)
-; RV32I-NEXT: sb s8, 48(sp)
-; RV32I-NEXT: sb s7, 47(sp)
-; RV32I-NEXT: sb s6, 46(sp)
-; RV32I-NEXT: sb s5, 45(sp)
-; RV32I-NEXT: sb s4, 44(sp)
-; RV32I-NEXT: sb s3, 43(sp)
-; RV32I-NEXT: sb s2, 42(sp)
-; RV32I-NEXT: sb s1, 41(sp)
-; RV32I-NEXT: sb s0, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: andi a1, a1, 31
-; RV32I-NEXT: addi a0, sp, 28
-; RV32I-NEXT: add a6, a0, a1
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: sb a4, 29(a2)
-; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
-; RV32I-NEXT: ret
- %src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
- %res = lshr i256 %src, %bitOff
- store i256 %res, ptr %dst, align 1
- ret void
-}
-define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: shl_32bytes:
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes_wordOff:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 119(sp)
-; RV64I-NEXT: sb a4, 118(sp)
-; RV64I-NEXT: sb a0, 117(sp)
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t0, 113(sp)
-; RV64I-NEXT: sb ra, 112(sp)
-; RV64I-NEXT: sb s11, 111(sp)
-; RV64I-NEXT: sb s10, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 88
-; RV64I-NEXT: sub a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
-; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: lbu a3, 9(a0)
+; RV64I-NEXT: lbu a4, 8(a0)
+; RV64I-NEXT: lbu a5, 10(a0)
+; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 13(a0)
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: lbu a5, 5(a1)
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu a7, 6(a1)
+; RV64I-NEXT: lbu t0, 7(a1)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: slli a5, a5, 37
+; RV64I-NEXT: or a5, a5, a1
+; RV64I-NEXT: addi a6, a5, -64
+; RV64I-NEXT: sra a1, a3, a5
+; RV64I-NEXT: bltz a6, .LBB11_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sraiw a3, a4, 31
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: mv a1, a3
+; RV64I-NEXT: j .LBB11_3
+; RV64I-NEXT: .LBB11_2:
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a6, 0(a0)
+; RV64I-NEXT: lbu a7, 2(a0)
+; RV64I-NEXT: lbu t0, 3(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: lbu a6, 5(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a4
+; RV64I-NEXT: srl a0, a0, a5
+; RV64I-NEXT: not a4, a5
+; RV64I-NEXT: slli a3, a3, 1
+; RV64I-NEXT: sll a3, a3, a4
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: .LBB11_3:
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 15(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 14(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 12
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: lw a1, 8(a0)
+; RV32I-NEXT: lw a3, 12(a0)
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: sb a1, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: sb a5, 10(a2)
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a1, a4, 16
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: srli a1, a4, 24
+; RV32I-NEXT: sb a1, 3(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 1(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
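
ashr only changes the fill: rather than zeroing the upper half of the
slot, the sign word is broadcast into it (the four sw of the srai result
above), and the generic-offset variant ends in sra instead of srl for the
top element. Sketch, same caveats:

#include <stdint.h>
#include <string.h>

void ashr_16bytes_wordOff_sketch(const uint8_t *src, unsigned wordOff,
                                 uint8_t *dst) {
  uint32_t slot[8];
  memcpy(slot, src, 16);
  /* cf. srai a0, a0, 31, stored at 16..28(sp) */
  uint32_t fill = (uint32_t)((int32_t)slot[3] >> 31);
  slot[4] = slot[5] = slot[6] = slot[7] = fill;
  memcpy(dst, slot + (wordOff & 3), 16);
}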
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a5, a4, a1
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: srl t0, t0, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: srl a1, a3, a1
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a5, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t2, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a5, a3, a0
+; RV32I-NEXT: lw a3, 4(a5)
+; RV32I-NEXT: slli a6, a1, 3
+; RV32I-NEXT: srl a4, a3, a6
+; RV32I-NEXT: lw a7, 8(a5)
+; RV32I-NEXT: andi a0, a6, 24
+; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: lw a1, 0(a5)
+; RV32I-NEXT: slli a0, a7, 1
+; RV32I-NEXT: sll a0, a0, t0
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: srl t1, a1, a6
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw t2, 12(a5)
+; RV32I-NEXT: lw t3, 16(a5)
+; RV32I-NEXT: sll a1, a3, t0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: srl t4, t2, a6
+; RV32I-NEXT: slli a3, t3, 1
+; RV32I-NEXT: sll a3, a3, t0
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw t5, 20(a5)
+; RV32I-NEXT: lw t6, 24(a5)
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: or t2, a7, t2
+; RV32I-NEXT: srl s0, t5, a6
+; RV32I-NEXT: slli s1, t6, 1
+; RV32I-NEXT: sll s1, s1, t0
+; RV32I-NEXT: or s1, s0, s1
+; RV32I-NEXT: srl t3, t3, a6
+; RV32I-NEXT: slli t5, t5, 1
+; RV32I-NEXT: lw a5, 28(a5)
+; RV32I-NEXT: sll t5, t5, t0
+; RV32I-NEXT: or t5, t3, t5
+; RV32I-NEXT: srl t6, t6, a6
+; RV32I-NEXT: slli s2, a5, 1
+; RV32I-NEXT: sll t0, s2, t0
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: srl a5, a5, a6
+; RV32I-NEXT: sb t6, 24(a2)
+; RV32I-NEXT: sb a5, 28(a2)
+; RV32I-NEXT: sb t3, 16(a2)
+; RV32I-NEXT: sb s0, 20(a2)
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb t4, 12(a2)
+; RV32I-NEXT: sb t1, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 31(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 27(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t5, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t5, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t5, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, s1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, s1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli s1, s1, 8
+; RV32I-NEXT: sb s1, 21(a2)
+; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a0, a1, 2
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a5, a1, 5
+; RV64I-NEXT: srl a1, a4, a5
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a5, 32
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srl t0, t0, a5
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a5
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: srl a3, a3, a5
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a5, a6, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a6, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a5, a6, 8
+; RV64I-NEXT: sb a5, 17(a2)
+; RV64I-NEXT: srli a5, a3, 56
+; RV64I-NEXT: sb a5, 31(a2)
+; RV64I-NEXT: srli a5, a3, 48
+; RV64I-NEXT: sb a5, 30(a2)
+; RV64I-NEXT: srli a5, a3, 40
+; RV64I-NEXT: sb a5, 29(a2)
+; RV64I-NEXT: srli a5, a3, 32
+; RV64I-NEXT: sb a5, 28(a2)
+; RV64I-NEXT: srli a5, a3, 24
+; RV64I-NEXT: sb a5, 27(a2)
+; RV64I-NEXT: srli a5, a3, 16
+; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a3, t0, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, t0, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a3, t0, 8
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a4, a4, 32
+; RV64I-NEXT: sb a4, 4(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a0, a0, 32
+; RV64I-NEXT: sb a0, 12(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a1, 0(a1)
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: mv a0, sp
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ld a1, 16(a0)
+; RV64I-NEXT: ld a3, 24(a0)
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a0, 8(a0)
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a5, a1, 56
+; RV64I-NEXT: sb a5, 23(a2)
+; RV64I-NEXT: srli a5, a1, 48
+; RV64I-NEXT: sb a5, 22(a2)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 21(a2)
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: sb a5, 20(a2)
+; RV64I-NEXT: srli a5, a1, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a1, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a3, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a3, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a3, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: ld a5, 0(a3)
+; RV64I-NEXT: sll a6, a4, a1
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: srli a0, a5, 1
+; RV64I-NEXT: ld t0, 24(a3)
+; RV64I-NEXT: ld a3, 16(a3)
+; RV64I-NEXT: srl a0, a0, a7
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: sll t0, t0, a1
+; RV64I-NEXT: srli t1, a3, 1
+; RV64I-NEXT: srl t1, t1, a7
+; RV64I-NEXT: or t1, t0, t1
+; RV64I-NEXT: sll a3, a3, a1
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a7
+; RV64I-NEXT: or a4, a3, a4
+; RV64I-NEXT: sll a1, a5, a1
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: srli a3, a3, 56
+; RV64I-NEXT: sb a3, 23(a2)
+; RV64I-NEXT: srli a3, t0, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 7(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 6(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 5(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a1, a6, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb t1, 24(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 17(a2)
+; RV64I-NEXT: srli a1, t1, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, t1, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, t1, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, t1, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, t1, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a1, t1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t2, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: addi a3, sp, 32
+; RV32I-NEXT: sub a6, a3, a0
+; RV32I-NEXT: lw a3, 4(a6)
+; RV32I-NEXT: slli a7, a1, 3
+; RV32I-NEXT: lw t0, 0(a6)
+; RV32I-NEXT: sll a4, a3, a7
+; RV32I-NEXT: andi a0, a7, 24
+; RV32I-NEXT: xori t1, a0, 31
+; RV32I-NEXT: srli a0, t0, 1
+; RV32I-NEXT: lw t2, 12(a6)
+; RV32I-NEXT: lw a5, 8(a6)
+; RV32I-NEXT: srl a0, a0, t1
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: sll t3, t2, a7
+; RV32I-NEXT: srli a1, a5, 1
+; RV32I-NEXT: srl a1, a1, t1
+; RV32I-NEXT: or a1, t3, a1
+; RV32I-NEXT: sll t4, a5, a7
+; RV32I-NEXT: srli a3, a3, 1
+; RV32I-NEXT: lw t5, 20(a6)
+; RV32I-NEXT: lw t6, 16(a6)
+; RV32I-NEXT: srl a3, a3, t1
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: sll s0, t5, a7
+; RV32I-NEXT: srli a5, t6, 1
+; RV32I-NEXT: srl a5, a5, t1
+; RV32I-NEXT: or a5, s0, a5
+; RV32I-NEXT: sll t6, t6, a7
+; RV32I-NEXT: srli t2, t2, 1
+; RV32I-NEXT: lw s1, 28(a6)
+; RV32I-NEXT: lw a6, 24(a6)
+; RV32I-NEXT: srl t2, t2, t1
+; RV32I-NEXT: or t2, t6, t2
+; RV32I-NEXT: sll s1, s1, a7
+; RV32I-NEXT: srli s2, a6, 1
+; RV32I-NEXT: srl s2, s2, t1
+; RV32I-NEXT: or s2, s1, s2
+; RV32I-NEXT: sll a6, a6, a7
+; RV32I-NEXT: srli t5, t5, 1
+; RV32I-NEXT: srl t1, t5, t1
+; RV32I-NEXT: or t1, a6, t1
+; RV32I-NEXT: sll a7, t0, a7
+; RV32I-NEXT: sb a7, 0(a2)
+; RV32I-NEXT: srli a6, a6, 24
+; RV32I-NEXT: sb a6, 27(a2)
+; RV32I-NEXT: srli s1, s1, 24
+; RV32I-NEXT: sb s1, 31(a2)
+; RV32I-NEXT: srli a6, t6, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli s0, s0, 24
+; RV32I-NEXT: sb s0, 23(a2)
+; RV32I-NEXT: srli a6, t4, 24
+; RV32I-NEXT: sb a6, 11(a2)
+; RV32I-NEXT: srli a6, t3, 24
+; RV32I-NEXT: sb a6, 15(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 2(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 1(a2)
+; RV32I-NEXT: srli a4, a4, 24
+; RV32I-NEXT: sb a4, 7(a2)
+; RV32I-NEXT: sb t1, 24(a2)
+; RV32I-NEXT: sb s2, 28(a2)
+; RV32I-NEXT: sb t2, 16(a2)
+; RV32I-NEXT: sb a5, 20(a2)
+; RV32I-NEXT: sb a3, 8(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, s2, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a4, s2, 8
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 21(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 13(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: slli a0, a1, 2
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a0, a3, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: slli a5, a1, 5
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: sll a3, a4, a5
+; RV64I-NEXT: andi a1, a5, 32
+; RV64I-NEXT: xori a7, a1, 63
+; RV64I-NEXT: srli a1, a6, 1
+; RV64I-NEXT: ld t0, 24(a0)
+; RV64I-NEXT: ld t1, 16(a0)
+; RV64I-NEXT: srl a0, a1, a7
+; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: sll t0, t0, a5
+; RV64I-NEXT: srli a1, t1, 1
+; RV64I-NEXT: srl a1, a1, a7
+; RV64I-NEXT: or a1, t0, a1
+; RV64I-NEXT: sll t1, t1, a5
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a7
+; RV64I-NEXT: or a4, t1, a4
+; RV64I-NEXT: sll a5, a6, a5
+; RV64I-NEXT: sb a5, 0(a2)
+; RV64I-NEXT: srli a6, t1, 56
+; RV64I-NEXT: sb a6, 23(a2)
+; RV64I-NEXT: srli a6, t1, 48
+; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, t1, 40
+; RV64I-NEXT: sb a6, 21(a2)
+; RV64I-NEXT: srli a6, t1, 32
+; RV64I-NEXT: sb a6, 20(a2)
+; RV64I-NEXT: srli a6, t0, 56
+; RV64I-NEXT: sb a6, 31(a2)
+; RV64I-NEXT: srli a6, t0, 48
+; RV64I-NEXT: sb a6, 30(a2)
+; RV64I-NEXT: srli a6, t0, 40
+; RV64I-NEXT: sb a6, 29(a2)
+; RV64I-NEXT: srli a6, t0, 32
+; RV64I-NEXT: sb a6, 28(a2)
+; RV64I-NEXT: srli a6, a5, 56
+; RV64I-NEXT: sb a6, 7(a2)
+; RV64I-NEXT: srli a6, a5, 48
+; RV64I-NEXT: sb a6, 6(a2)
+; RV64I-NEXT: srli a6, a5, 40
+; RV64I-NEXT: sb a6, 5(a2)
+; RV64I-NEXT: srli a6, a5, 32
+; RV64I-NEXT: sb a6, 4(a2)
+; RV64I-NEXT: srli a6, a5, 24
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a6, a5, 16
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: srli a5, a3, 56
+; RV64I-NEXT: sb a5, 15(a2)
+; RV64I-NEXT: srli a5, a3, 48
+; RV64I-NEXT: sb a5, 14(a2)
+; RV64I-NEXT: srli a5, a3, 40
+; RV64I-NEXT: sb a5, 13(a2)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a3, a4, 24
+; RV64I-NEXT: sb a3, 19(a2)
+; RV64I-NEXT: srli a3, a4, 16
+; RV64I-NEXT: sb a3, 18(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 17(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: addi a0, sp, 32
+; RV32I-NEXT: sub a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a1, 0(a1)
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: addi a0, sp, 32
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: ld a1, 16(a0)
+; RV64I-NEXT: ld a3, 24(a0)
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a0, 8(a0)
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a5, a1, 56
+; RV64I-NEXT: sb a5, 23(a2)
+; RV64I-NEXT: srli a5, a1, 48
+; RV64I-NEXT: sb a5, 22(a2)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 21(a2)
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: sb a5, 20(a2)
+; RV64I-NEXT: srli a5, a1, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a1, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a3, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a3, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a3, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: addi a0, sp, 32
+; RV32I-NEXT: sub a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a5, a4, a1
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: srl t0, t0, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a5, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t3, t3, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a5, a3, a0
+; RV32I-NEXT: lw a3, 4(a5)
+; RV32I-NEXT: slli a6, a1, 3
+; RV32I-NEXT: srl a4, a3, a6
+; RV32I-NEXT: lw a7, 8(a5)
+; RV32I-NEXT: andi a0, a6, 24
+; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: lw a1, 0(a5)
+; RV32I-NEXT: slli a0, a7, 1
+; RV32I-NEXT: sll a0, a0, t0
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: srl t1, a1, a6
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw t2, 12(a5)
+; RV32I-NEXT: lw t3, 16(a5)
+; RV32I-NEXT: sll a1, a3, t0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: srl t4, t2, a6
+; RV32I-NEXT: slli a3, t3, 1
+; RV32I-NEXT: sll a3, a3, t0
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw t5, 20(a5)
+; RV32I-NEXT: lw t6, 24(a5)
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: or t2, a7, t2
+; RV32I-NEXT: srl s0, t5, a6
+; RV32I-NEXT: slli s1, t6, 1
+; RV32I-NEXT: sll s1, s1, t0
+; RV32I-NEXT: or s1, s0, s1
+; RV32I-NEXT: srl t3, t3, a6
+; RV32I-NEXT: slli t5, t5, 1
+; RV32I-NEXT: lw a5, 28(a5)
+; RV32I-NEXT: sll t5, t5, t0
+; RV32I-NEXT: or t5, t3, t5
+; RV32I-NEXT: srl t6, t6, a6
+; RV32I-NEXT: slli s2, a5, 1
+; RV32I-NEXT: sll t0, s2, t0
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: sra a5, a5, a6
+; RV32I-NEXT: sb t6, 24(a2)
+; RV32I-NEXT: sb a5, 28(a2)
+; RV32I-NEXT: sb t3, 16(a2)
+; RV32I-NEXT: sb s0, 20(a2)
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb t4, 12(a2)
+; RV32I-NEXT: sb t1, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 31(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 27(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t5, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t5, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t5, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, s1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, s1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli s1, s1, 8
+; RV32I-NEXT: sb s1, 21(a2)
+; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a0, a1, 2
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a5, a1, 5
+; RV64I-NEXT: srl a1, a4, a5
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a5, 32
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srl t0, t0, a5
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a5
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: sra a3, a3, a5
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a5, a6, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a6, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a5, a6, 8
+; RV64I-NEXT: sb a5, 17(a2)
+; RV64I-NEXT: srli a5, a3, 56
+; RV64I-NEXT: sb a5, 31(a2)
+; RV64I-NEXT: srli a5, a3, 48
+; RV64I-NEXT: sb a5, 30(a2)
+; RV64I-NEXT: srli a5, a3, 40
+; RV64I-NEXT: sb a5, 29(a2)
+; RV64I-NEXT: srli a5, a3, 32
+; RV64I-NEXT: sb a5, 28(a2)
+; RV64I-NEXT: srli a5, a3, 24
+; RV64I-NEXT: sb a5, 27(a2)
+; RV64I-NEXT: srli a5, a3, 16
+; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a3, t0, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, t0, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a3, t0, 8
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a4, a4, 32
+; RV64I-NEXT: sb a4, 4(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
-; RV32I-LABEL: shl_32bytes:
+; RV32I-LABEL: ashr_32bytes_wordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu t0, 25(a0)
-; RV32I-NEXT: lbu a7, 26(a0)
-; RV32I-NEXT: lbu a6, 27(a0)
-; RV32I-NEXT: lbu a5, 28(a0)
-; RV32I-NEXT: lbu a3, 31(a0)
-; RV32I-NEXT: lbu a4, 30(a0)
-; RV32I-NEXT: lbu a0, 29(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t2, t3, t2
; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 91(sp)
-; RV32I-NEXT: sb a4, 90(sp)
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a5, 88(sp)
-; RV32I-NEXT: sb a6, 87(sp)
-; RV32I-NEXT: sb a7, 86(sp)
-; RV32I-NEXT: sb zero, 59(sp)
-; RV32I-NEXT: sb zero, 58(sp)
-; RV32I-NEXT: sb zero, 57(sp)
-; RV32I-NEXT: sb zero, 56(sp)
-; RV32I-NEXT: sb zero, 55(sp)
-; RV32I-NEXT: sb zero, 54(sp)
-; RV32I-NEXT: sb zero, 53(sp)
-; RV32I-NEXT: sb zero, 52(sp)
-; RV32I-NEXT: sb zero, 51(sp)
-; RV32I-NEXT: sb zero, 50(sp)
-; RV32I-NEXT: sb zero, 49(sp)
-; RV32I-NEXT: sb zero, 48(sp)
-; RV32I-NEXT: sb zero, 47(sp)
-; RV32I-NEXT: sb zero, 46(sp)
-; RV32I-NEXT: sb zero, 45(sp)
-; RV32I-NEXT: sb zero, 44(sp)
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb t0, 85(sp)
-; RV32I-NEXT: sb ra, 84(sp)
-; RV32I-NEXT: sb s11, 83(sp)
-; RV32I-NEXT: sb s10, 82(sp)
-; RV32I-NEXT: sb s9, 81(sp)
-; RV32I-NEXT: sb s8, 80(sp)
-; RV32I-NEXT: sb s7, 79(sp)
-; RV32I-NEXT: sb s6, 78(sp)
-; RV32I-NEXT: sb s5, 77(sp)
-; RV32I-NEXT: sb s4, 76(sp)
-; RV32I-NEXT: sb s3, 75(sp)
-; RV32I-NEXT: sb s2, 74(sp)
-; RV32I-NEXT: sb s1, 73(sp)
-; RV32I-NEXT: sb s0, 72(sp)
-; RV32I-NEXT: sb t6, 71(sp)
-; RV32I-NEXT: sb t5, 70(sp)
-; RV32I-NEXT: sb t4, 69(sp)
-; RV32I-NEXT: sb t3, 68(sp)
-; RV32I-NEXT: sb t2, 67(sp)
-; RV32I-NEXT: sb t1, 66(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 63(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 62(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: andi a1, a1, 31
-; RV32I-NEXT: addi a0, sp, 60
-; RV32I-NEXT: sub a6, a0, a1
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: sb a4, 29(a2)
-; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
- %res = shl i256 %src, %bitOff
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
-define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: ashr_32bytes:
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_dwordOff:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: mv t0, a1
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a1, 0(a0)
-; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 1(a0)
-; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 2(a0)
-; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 3(a0)
-; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 4(a0)
-; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 5(a0)
-; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t2, 6(a0)
-; RV64I-NEXT: lbu t3, 7(a0)
-; RV64I-NEXT: lbu t4, 8(a0)
-; RV64I-NEXT: lbu t5, 9(a0)
-; RV64I-NEXT: lbu t6, 10(a0)
-; RV64I-NEXT: lbu s0, 11(a0)
-; RV64I-NEXT: lbu s1, 12(a0)
-; RV64I-NEXT: lbu s2, 13(a0)
-; RV64I-NEXT: lbu s3, 14(a0)
-; RV64I-NEXT: lbu s4, 15(a0)
-; RV64I-NEXT: lbu s5, 16(a0)
-; RV64I-NEXT: lbu s6, 17(a0)
-; RV64I-NEXT: lbu s7, 18(a0)
-; RV64I-NEXT: lbu s8, 19(a0)
-; RV64I-NEXT: lbu s9, 20(a0)
-; RV64I-NEXT: lbu s10, 21(a0)
-; RV64I-NEXT: lbu s11, 22(a0)
-; RV64I-NEXT: lbu ra, 23(a0)
-; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a4, 27(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: lbu t0, 0(t0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a4, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb ra, 79(sp)
-; RV64I-NEXT: sb s11, 78(sp)
-; RV64I-NEXT: sb s10, 77(sp)
-; RV64I-NEXT: sb s9, 76(sp)
-; RV64I-NEXT: sb s8, 75(sp)
-; RV64I-NEXT: sb s7, 74(sp)
-; RV64I-NEXT: sb s6, 73(sp)
-; RV64I-NEXT: sb s5, 72(sp)
-; RV64I-NEXT: sb s4, 71(sp)
-; RV64I-NEXT: sb s3, 70(sp)
-; RV64I-NEXT: sb s2, 69(sp)
-; RV64I-NEXT: sb s1, 68(sp)
-; RV64I-NEXT: sb s0, 67(sp)
-; RV64I-NEXT: sb t6, 66(sp)
-; RV64I-NEXT: sb t5, 65(sp)
-; RV64I-NEXT: sb t4, 64(sp)
-; RV64I-NEXT: sb t3, 63(sp)
-; RV64I-NEXT: sb t2, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a1, 0(a1)
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: mv a0, sp
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ld a1, 16(a0)
+; RV64I-NEXT: ld a3, 24(a0)
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a0, 8(a0)
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a5, a1, 56
+; RV64I-NEXT: sb a5, 23(a2)
+; RV64I-NEXT: srli a5, a1, 48
+; RV64I-NEXT: sb a5, 22(a2)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 21(a2)
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: sb a5, 20(a2)
+; RV64I-NEXT: srli a5, a1, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a1, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a3, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a3, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a3, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: andi a0, t0, 31
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a6, a1, a0
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
-; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
-; RV32I-LABEL: ashr_32bytes:
+; RV32I-LABEL: ashr_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv t0, a1
-; RV32I-NEXT: lbu t1, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a0)
-; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 1(a0)
-; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 2(a0)
-; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 3(a0)
-; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 4(a0)
-; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t2, 6(a0)
-; RV32I-NEXT: lbu t3, 7(a0)
-; RV32I-NEXT: lbu t4, 8(a0)
-; RV32I-NEXT: lbu t5, 9(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 12(a0)
-; RV32I-NEXT: lbu s2, 13(a0)
-; RV32I-NEXT: lbu s3, 14(a0)
-; RV32I-NEXT: lbu s4, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s6, 17(a0)
-; RV32I-NEXT: lbu s7, 18(a0)
-; RV32I-NEXT: lbu s8, 19(a0)
-; RV32I-NEXT: lbu s9, 20(a0)
-; RV32I-NEXT: lbu s10, 21(a0)
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: lbu a7, 24(a0)
-; RV32I-NEXT: lbu a6, 25(a0)
-; RV32I-NEXT: lbu a5, 26(a0)
-; RV32I-NEXT: lbu a4, 27(a0)
-; RV32I-NEXT: lbu a1, 30(a0)
-; RV32I-NEXT: lbu a3, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: lbu t0, 0(t0)
-; RV32I-NEXT: sb a1, 58(sp)
-; RV32I-NEXT: sb a3, 57(sp)
-; RV32I-NEXT: sb a0, 56(sp)
-; RV32I-NEXT: sb a4, 55(sp)
-; RV32I-NEXT: sb a5, 54(sp)
-; RV32I-NEXT: sb a6, 53(sp)
-; RV32I-NEXT: sb t1, 59(sp)
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: sb a7, 52(sp)
-; RV32I-NEXT: sb ra, 51(sp)
-; RV32I-NEXT: sb s11, 50(sp)
-; RV32I-NEXT: sb s10, 49(sp)
-; RV32I-NEXT: sb s9, 48(sp)
-; RV32I-NEXT: sb s8, 47(sp)
-; RV32I-NEXT: sb s7, 46(sp)
-; RV32I-NEXT: sb s6, 45(sp)
-; RV32I-NEXT: sb s5, 44(sp)
-; RV32I-NEXT: sb s4, 43(sp)
-; RV32I-NEXT: sb s3, 42(sp)
-; RV32I-NEXT: sb s2, 41(sp)
-; RV32I-NEXT: sb s1, 40(sp)
-; RV32I-NEXT: sb s0, 39(sp)
-; RV32I-NEXT: sb t6, 38(sp)
-; RV32I-NEXT: sb t5, 37(sp)
-; RV32I-NEXT: sb t4, 36(sp)
-; RV32I-NEXT: sb t3, 35(sp)
-; RV32I-NEXT: sb t2, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: srai a0, t1, 31
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a0, 84(sp)
-; RV32I-NEXT: sb a0, 80(sp)
-; RV32I-NEXT: sb a0, 76(sp)
-; RV32I-NEXT: sb a0, 72(sp)
-; RV32I-NEXT: sb a0, 68(sp)
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: sb a0, 60(sp)
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
; RV32I-NEXT: srli a1, a0, 24
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 90(sp)
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a1, 87(sp)
-; RV32I-NEXT: sb a3, 86(sp)
-; RV32I-NEXT: sb a0, 85(sp)
-; RV32I-NEXT: sb a1, 83(sp)
-; RV32I-NEXT: sb a3, 82(sp)
-; RV32I-NEXT: sb a0, 81(sp)
-; RV32I-NEXT: sb a1, 79(sp)
-; RV32I-NEXT: sb a3, 78(sp)
-; RV32I-NEXT: sb a0, 77(sp)
-; RV32I-NEXT: sb a1, 75(sp)
-; RV32I-NEXT: sb a3, 74(sp)
-; RV32I-NEXT: sb a0, 73(sp)
-; RV32I-NEXT: sb a1, 71(sp)
-; RV32I-NEXT: sb a3, 70(sp)
-; RV32I-NEXT: sb a0, 69(sp)
-; RV32I-NEXT: sb a1, 67(sp)
-; RV32I-NEXT: sb a3, 66(sp)
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: sb a1, 63(sp)
-; RV32I-NEXT: sb a3, 62(sp)
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: andi a0, t0, 31
-; RV32I-NEXT: addi a1, sp, 28
-; RV32I-NEXT: add a6, a1, a0
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: sb a4, 29(a2)
-; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afaa..7e879b137b4f0d 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -704,164 +704,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 1(a1)
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: or a1, a1, s0
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb s4, 26(sp)
-; RV32I-NEXT: sb s3, 25(sp)
-; RV32I-NEXT: sb s2, 24(sp)
-; RV32I-NEXT: sb t6, 23(sp)
-; RV32I-NEXT: sb t5, 22(sp)
-; RV32I-NEXT: sb t4, 21(sp)
-; RV32I-NEXT: sb t3, 20(sp)
-; RV32I-NEXT: sb t2, 19(sp)
-; RV32I-NEXT: sb t1, 18(sp)
-; RV32I-NEXT: sb t0, 17(sp)
-; RV32I-NEXT: sb a7, 16(sp)
-; RV32I-NEXT: sb a6, 15(sp)
-; RV32I-NEXT: sb a5, 14(sp)
-; RV32I-NEXT: sb a4, 13(sp)
-; RV32I-NEXT: sb a3, 12(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 12
-; RV32I-NEXT: add a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: srl a0, a5, a4
-; RV32I-NEXT: lbu a1, 9(a3)
-; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: slli a1, a6, 1
-; RV32I-NEXT: not a7, a4
-; RV32I-NEXT: sll a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu a7, 1(a3)
-; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: or a7, a7, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: srl a7, a7, a4
-; RV32I-NEXT: slli a5, a5, 1
-; RV32I-NEXT: xori t0, a4, 31
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: srl a6, a6, a4
-; RV32I-NEXT: lbu t1, 13(a3)
-; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: slli t1, a3, 1
-; RV32I-NEXT: sll t0, t1, t0
-; RV32I-NEXT: or t0, a6, t0
-; RV32I-NEXT: srl a3, a3, a4
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: srli a4, a6, 16
-; RV32I-NEXT: sb a4, 10(a2)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 9(a2)
-; RV32I-NEXT: srli a4, a3, 16
-; RV32I-NEXT: sb a4, 14(a2)
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: sb a4, 15(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a7, 16
-; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a7, 8
-; RV32I-NEXT: sb a3, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a4, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli a7, a0, 1
+; RV32I-NEXT: sll a6, a7, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a5, 24
; RV32I-NEXT: sb a0, 11(a2)
-; RV32I-NEXT: srli a5, a5, 24
-; RV32I-NEXT: sb a5, 3(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, a4, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a4, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -987,164 +940,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 1(a1)
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: or a1, a1, s0
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb a0, 43(sp)
-; RV32I-NEXT: sb s4, 42(sp)
-; RV32I-NEXT: sb s3, 41(sp)
-; RV32I-NEXT: sb s2, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: sb t0, 33(sp)
-; RV32I-NEXT: sb a7, 32(sp)
-; RV32I-NEXT: sb a6, 31(sp)
-; RV32I-NEXT: sb a5, 30(sp)
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a3, 28(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 28
-; RV32I-NEXT: sub a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: sll a0, a5, a4
-; RV32I-NEXT: lbu a1, 1(a3)
-; RV32I-NEXT: lbu a6, 0(a3)
-; RV32I-NEXT: lbu a7, 2(a3)
-; RV32I-NEXT: lbu t0, 3(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: srli a1, a6, 1
-; RV32I-NEXT: xori a7, a4, 31
-; RV32I-NEXT: srl a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu t0, 13(a3)
-; RV32I-NEXT: lbu t1, 12(a3)
-; RV32I-NEXT: lbu t2, 14(a3)
-; RV32I-NEXT: lbu t3, 15(a3)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t3, t2
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: sll t0, t0, a4
-; RV32I-NEXT: lbu t1, 9(a3)
-; RV32I-NEXT: lbu t2, 8(a3)
-; RV32I-NEXT: lbu t3, 10(a3)
-; RV32I-NEXT: lbu a3, 11(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: srli t1, a3, 1
-; RV32I-NEXT: srl a7, t1, a7
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: sll a3, a3, a4
-; RV32I-NEXT: srli a5, a5, 1
-; RV32I-NEXT: not t1, a4
-; RV32I-NEXT: srl a5, a5, t1
-; RV32I-NEXT: or a5, a3, a5
-; RV32I-NEXT: sll a4, a6, a4
-; RV32I-NEXT: sb a4, 0(a2)
-; RV32I-NEXT: srli a6, a3, 16
-; RV32I-NEXT: sb a6, 10(a2)
-; RV32I-NEXT: srli a6, a3, 24
-; RV32I-NEXT: sb a6, 11(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 9(a2)
-; RV32I-NEXT: srli a3, t0, 16
-; RV32I-NEXT: sb a3, 14(a2)
-; RV32I-NEXT: srli a3, t0, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a3, t0, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: addi a3, sp, 16
+; RV32I-NEXT: sub a3, a3, a0
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a0, a1
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: srli a7, a4, 1
+; RV32I-NEXT: lw t0, 12(a3)
+; RV32I-NEXT: lw a3, 8(a3)
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: sll a7, t0, a1
+; RV32I-NEXT: srli t0, a3, 1
+; RV32I-NEXT: srl t0, t0, a6
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: sll a3, a3, a1
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a3, a1, 16
; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: srli a3, a1, 24
; RV32I-NEXT: sb a3, 3(a2)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
-; RV32I-NEXT: srli a3, a0, 24
-; RV32I-NEXT: sb a3, 7(a2)
-; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: sb a0, 8(a2)
; RV32I-NEXT: sb a7, 12(a2)
-; RV32I-NEXT: sb a1, 4(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: sb a5, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 11(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a7, 16
+; RV32I-NEXT: sb a0, 14(a2)
+; RV32I-NEXT: srli a0, a7, 24
+; RV32I-NEXT: sb a0, 15(a2)
+; RV32I-NEXT: srli a0, a7, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a5, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1270,171 +1176,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t2, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 1(a1)
-; RV32I-NEXT: lbu s2, 0(a1)
-; RV32I-NEXT: lbu s3, 11(a0)
-; RV32I-NEXT: lbu s4, 12(a0)
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: or s1, s1, s2
-; RV32I-NEXT: lbu s2, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s5, 13(a0)
-; RV32I-NEXT: lbu a0, 14(a0)
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s2
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: sb a3, 23(sp)
-; RV32I-NEXT: sb a0, 22(sp)
-; RV32I-NEXT: sb s5, 21(sp)
-; RV32I-NEXT: sb s4, 20(sp)
-; RV32I-NEXT: sb s3, 19(sp)
-; RV32I-NEXT: sb s0, 18(sp)
-; RV32I-NEXT: sb t6, 17(sp)
-; RV32I-NEXT: sb t5, 16(sp)
-; RV32I-NEXT: sb t4, 15(sp)
-; RV32I-NEXT: sb t3, 14(sp)
-; RV32I-NEXT: sb t2, 13(sp)
-; RV32I-NEXT: sb t1, 12(sp)
-; RV32I-NEXT: sb t0, 11(sp)
-; RV32I-NEXT: sb a7, 10(sp)
-; RV32I-NEXT: sb a6, 9(sp)
-; RV32I-NEXT: sb a5, 8(sp)
-; RV32I-NEXT: srai a4, a4, 31
-; RV32I-NEXT: sb a4, 36(sp)
-; RV32I-NEXT: sb a4, 32(sp)
-; RV32I-NEXT: sb a4, 28(sp)
-; RV32I-NEXT: sb a4, 24(sp)
-; RV32I-NEXT: srli a0, a4, 24
-; RV32I-NEXT: sb a0, 39(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 38(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 37(sp)
-; RV32I-NEXT: sb a0, 35(sp)
-; RV32I-NEXT: sb a3, 34(sp)
-; RV32I-NEXT: sb a4, 33(sp)
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 8
-; RV32I-NEXT: add a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: srl a0, a5, a4
-; RV32I-NEXT: lbu a1, 9(a3)
-; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: slli a1, a6, 1
-; RV32I-NEXT: not a7, a4
-; RV32I-NEXT: sll a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu a7, 1(a3)
-; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: srl a7, a7, a4
-; RV32I-NEXT: slli a5, a5, 1
-; RV32I-NEXT: xori t0, a4, 31
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: srl a6, a6, a4
-; RV32I-NEXT: lbu t1, 13(a3)
-; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: slli t1, a3, 1
-; RV32I-NEXT: sll t0, t1, t0
-; RV32I-NEXT: or t0, a6, t0
-; RV32I-NEXT: sra a3, a3, a4
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: srli a4, a6, 16
-; RV32I-NEXT: sb a4, 10(a2)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 9(a2)
-; RV32I-NEXT: srli a4, a3, 16
-; RV32I-NEXT: sb a4, 14(a2)
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: sb a4, 15(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a7, 16
-; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a7, 8
-; RV32I-NEXT: sb a3, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a4, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli a7, a0, 1
+; RV32I-NEXT: sll a6, a7, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a5, 24
; RV32I-NEXT: sb a0, 11(a2)
-; RV32I-NEXT: srli a5, a5, 24
-; RV32I-NEXT: sb a5, 3(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, a4, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a4, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1446,191 +1299,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
-; RV64I-NEXT: lbu s9, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 87(sp)
-; RV64I-NEXT: sb a3, 86(sp)
-; RV64I-NEXT: sb a4, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a5, 83(sp)
-; RV64I-NEXT: sb a6, 82(sp)
-; RV64I-NEXT: sb a7, 81(sp)
-; RV64I-NEXT: sb s11, 80(sp)
-; RV64I-NEXT: sb s10, 79(sp)
-; RV64I-NEXT: sb ra, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a3, sp, 56
-; RV64I-NEXT: add a3, a3, a0
-; RV64I-NEXT: lbu a0, 9(a3)
-; RV64I-NEXT: lbu a1, 8(a3)
-; RV64I-NEXT: lbu a4, 10(a3)
-; RV64I-NEXT: lbu a5, 11(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a1, 13(a3)
-; RV64I-NEXT: lbu a4, 12(a3)
-; RV64I-NEXT: lbu a5, 14(a3)
-; RV64I-NEXT: lbu a6, 15(a3)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a4, a1, a0
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a0, 17(a3)
-; RV64I-NEXT: lbu a5, 16(a3)
-; RV64I-NEXT: lbu a6, 18(a3)
-; RV64I-NEXT: lbu a7, 19(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a3)
-; RV64I-NEXT: lbu a6, 20(a3)
-; RV64I-NEXT: lbu a7, 22(a3)
-; RV64I-NEXT: lbu t0, 23(a3)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -1638,92 +1343,138 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a1
-; RV64I-NEXT: sll a0, a0, a6
-; RV64I-NEXT: lbu a6, 1(a3)
-; RV64I-NEXT: lbu a7, 0(a3)
-; RV64I-NEXT: lbu t0, 2(a3)
-; RV64I-NEXT: lbu t1, 3(a3)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a3)
-; RV64I-NEXT: lbu t0, 4(a3)
-; RV64I-NEXT: lbu t1, 6(a3)
-; RV64I-NEXT: lbu t2, 7(a3)
-; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a3)
-; RV64I-NEXT: lbu t0, 24(a3)
-; RV64I-NEXT: lbu t1, 26(a3)
-; RV64I-NEXT: lbu t2, 27(a3)
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a3)
-; RV64I-NEXT: lbu t1, 28(a3)
-; RV64I-NEXT: lbu t2, 30(a3)
-; RV64I-NEXT: lbu a3, 31(a3)
-; RV64I-NEXT: slli t0, t0, 8
-; RV64I-NEXT: or t0, t0, t1
-; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli a3, a3, 24
-; RV64I-NEXT: or a3, a3, t2
-; RV64I-NEXT: slli t1, a4, 1
-; RV64I-NEXT: or a3, a3, t0
-; RV64I-NEXT: xori t0, a1, 63
-; RV64I-NEXT: sll t1, t1, t0
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a7, a3, a7
-; RV64I-NEXT: slli a3, a7, 1
-; RV64I-NEXT: sll t0, a3, t0
-; RV64I-NEXT: srl a3, a4, a1
-; RV64I-NEXT: srl a4, a6, a1
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: srl a0, a4, a1
+; RV64I-NEXT: ld a5, 16(a3)
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: ld a7, 0(a3)
+; RV64I-NEXT: slli t0, a5, 1
+; RV64I-NEXT: sll t0, t0, a6
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: srl a7, a7, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a6
+; RV64I-NEXT: or a4, a7, a4
; RV64I-NEXT: srl a5, a5, a1
-; RV64I-NEXT: srl a1, a7, a1
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
-; RV64I-NEXT: sb a5, 16(a2)
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: slli a7, a3, 1
+; RV64I-NEXT: sll a6, a7, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: srl a1, a3, a1
; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
; RV64I-NEXT: srli a1, a1, 8
; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb a5, 16(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a5, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a5, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a5, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a5, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a5, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a5, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: sb a5, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
; RV64I-NEXT: srli a1, a4, 48
; RV64I-NEXT: sb a1, 6(a2)
; RV64I-NEXT: srli a1, a4, 40
@@ -1734,366 +1485,234 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 3(a2)
; RV64I-NEXT: srli a1, a4, 16
; RV64I-NEXT: sb a1, 2(a2)
-; RV64I-NEXT: or a1, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
; RV64I-NEXT: srli a4, a4, 8
; RV64I-NEXT: sb a4, 1(a2)
-; RV64I-NEXT: srli a4, a3, 48
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: srli a4, a3, 40
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: srli a4, a3, 32
-; RV64I-NEXT: sb a4, 12(a2)
-; RV64I-NEXT: srli a4, a3, 24
-; RV64I-NEXT: sb a4, 11(a2)
-; RV64I-NEXT: srli a4, a3, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
-; RV64I-NEXT: sb a3, 8(a2)
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: srli a3, a6, 56
-; RV64I-NEXT: sb a3, 23(a2)
-; RV64I-NEXT: srli a1, a1, 56
-; RV64I-NEXT: sb a1, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s10, 1(a1)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or s10, s10, s11
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: or t0, a1, s10
-; RV32I-NEXT: lbu s10, 24(a0)
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu a6, 26(a0)
-; RV32I-NEXT: lbu a5, 27(a0)
-; RV32I-NEXT: lbu a1, 31(a0)
-; RV32I-NEXT: lbu a3, 30(a0)
-; RV32I-NEXT: lbu a4, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: sb a1, 59(sp)
-; RV32I-NEXT: sb a3, 58(sp)
-; RV32I-NEXT: sb a4, 57(sp)
-; RV32I-NEXT: sb a0, 56(sp)
-; RV32I-NEXT: sb a5, 55(sp)
-; RV32I-NEXT: sb a6, 54(sp)
-; RV32I-NEXT: sb a7, 53(sp)
-; RV32I-NEXT: sb s10, 52(sp)
-; RV32I-NEXT: sb ra, 51(sp)
-; RV32I-NEXT: sb s11, 50(sp)
-; RV32I-NEXT: sb s9, 49(sp)
-; RV32I-NEXT: sb s8, 48(sp)
-; RV32I-NEXT: sb s7, 47(sp)
-; RV32I-NEXT: sb s6, 46(sp)
-; RV32I-NEXT: sb s5, 45(sp)
-; RV32I-NEXT: sb s4, 44(sp)
-; RV32I-NEXT: sb zero, 91(sp)
-; RV32I-NEXT: sb zero, 90(sp)
-; RV32I-NEXT: sb zero, 89(sp)
-; RV32I-NEXT: sb zero, 88(sp)
-; RV32I-NEXT: sb zero, 87(sp)
-; RV32I-NEXT: sb zero, 86(sp)
-; RV32I-NEXT: sb zero, 85(sp)
-; RV32I-NEXT: sb zero, 84(sp)
-; RV32I-NEXT: sb zero, 83(sp)
-; RV32I-NEXT: sb zero, 82(sp)
-; RV32I-NEXT: sb zero, 81(sp)
-; RV32I-NEXT: sb zero, 80(sp)
-; RV32I-NEXT: sb zero, 79(sp)
-; RV32I-NEXT: sb zero, 78(sp)
-; RV32I-NEXT: sb zero, 77(sp)
-; RV32I-NEXT: sb zero, 76(sp)
-; RV32I-NEXT: sb zero, 75(sp)
-; RV32I-NEXT: sb zero, 74(sp)
-; RV32I-NEXT: sb zero, 73(sp)
-; RV32I-NEXT: sb zero, 72(sp)
-; RV32I-NEXT: sb zero, 71(sp)
-; RV32I-NEXT: sb zero, 70(sp)
-; RV32I-NEXT: sb zero, 69(sp)
-; RV32I-NEXT: sb zero, 68(sp)
-; RV32I-NEXT: sb zero, 67(sp)
-; RV32I-NEXT: sb zero, 66(sp)
-; RV32I-NEXT: sb zero, 65(sp)
-; RV32I-NEXT: sb zero, 64(sp)
-; RV32I-NEXT: sb zero, 63(sp)
-; RV32I-NEXT: sb zero, 62(sp)
-; RV32I-NEXT: sb zero, 61(sp)
-; RV32I-NEXT: sb zero, 60(sp)
-; RV32I-NEXT: sb s3, 43(sp)
-; RV32I-NEXT: sb s2, 42(sp)
-; RV32I-NEXT: sb s1, 41(sp)
-; RV32I-NEXT: sb s0, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: slli a0, t0, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 28
-; RV32I-NEXT: add a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a3, t0, 7
-; RV32I-NEXT: lbu a0, 9(a4)
-; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
-; RV32I-NEXT: or a6, a1, a0
-; RV32I-NEXT: slli a0, a6, 1
-; RV32I-NEXT: not t1, a3
-; RV32I-NEXT: sll a0, a0, t1
-; RV32I-NEXT: lbu a1, 1(a4)
-; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or t0, a5, a1
-; RV32I-NEXT: slli a1, t5, 1
-; RV32I-NEXT: xori t2, a3, 31
-; RV32I-NEXT: sll a1, a1, t2
-; RV32I-NEXT: lbu a5, 13(a4)
-; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t3, a7, a5
-; RV32I-NEXT: lbu a5, 17(a4)
-; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
-; RV32I-NEXT: or t4, a7, a5
-; RV32I-NEXT: slli a5, t4, 1
-; RV32I-NEXT: sll a7, a5, t1
-; RV32I-NEXT: lbu a5, 21(a4)
-; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a5
-; RV32I-NEXT: lbu a5, 25(a4)
-; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: or t6, t6, a5
-; RV32I-NEXT: lbu a5, 29(a4)
-; RV32I-NEXT: lbu s1, 28(a4)
-; RV32I-NEXT: slli s2, t6, 1
-; RV32I-NEXT: sll t1, s2, t1
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: slli s1, s0, 1
-; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: or s3, a4, a5
-; RV32I-NEXT: slli a4, s3, 1
-; RV32I-NEXT: sll t2, a4, t2
-; RV32I-NEXT: srl a4, t5, a3
-; RV32I-NEXT: srl a5, t0, a3
-; RV32I-NEXT: srl t0, t3, a3
-; RV32I-NEXT: srl a6, a6, a3
-; RV32I-NEXT: srl t3, s0, a3
-; RV32I-NEXT: srl t4, t4, a3
-; RV32I-NEXT: srl t5, t6, a3
-; RV32I-NEXT: srl a3, s3, a3
-; RV32I-NEXT: srli t6, t5, 16
-; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: or t2, t5, t2
-; RV32I-NEXT: sb t5, 24(a2)
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: sb t5, 25(a2)
-; RV32I-NEXT: srli t5, a3, 24
-; RV32I-NEXT: sb t5, 31(a2)
-; RV32I-NEXT: srli t5, a3, 16
-; RV32I-NEXT: sb t5, 30(a2)
-; RV32I-NEXT: sb a3, 28(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 29(a2)
-; RV32I-NEXT: srli a3, t4, 16
-; RV32I-NEXT: sb a3, 18(a2)
-; RV32I-NEXT: or a3, t4, s1
-; RV32I-NEXT: sb t4, 16(a2)
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb t4, 17(a2)
-; RV32I-NEXT: srli t4, t3, 16
-; RV32I-NEXT: sb t4, 22(a2)
-; RV32I-NEXT: or t1, t3, t1
-; RV32I-NEXT: sb t3, 20(a2)
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, a6, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, a6, s2
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: srli a6, a6, 8
-; RV32I-NEXT: sb a6, 9(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: sb a6, 14(a2)
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: srli a7, t0, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a5, 16
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: sb a5, 0(a2)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 1(a2)
-; RV32I-NEXT: srli a5, a4, 16
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: mv a1, sp
+; RV32I-NEXT: add a4, a1, a0
+; RV32I-NEXT: lw a1, 4(a4)
+; RV32I-NEXT: srl a0, a1, a7
+; RV32I-NEXT: lw a5, 8(a4)
+; RV32I-NEXT: andi a3, a7, 31
+; RV32I-NEXT: xori a6, a3, 31
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 16(a4)
+; RV32I-NEXT: sll a1, a1, a6
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srl a3, t0, a7
+; RV32I-NEXT: slli t2, t1, 1
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or a3, a3, t2
+; RV32I-NEXT: srl a5, a5, a7
+; RV32I-NEXT: slli t0, t0, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 24(a4)
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a5, a5, t0
+; RV32I-NEXT: srl t0, t2, a7
+; RV32I-NEXT: slli t4, t3, 1
+; RV32I-NEXT: sll t4, t4, a6
+; RV32I-NEXT: or t0, t0, t4
+; RV32I-NEXT: srl t1, t1, a7
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: srl t2, t3, a7
+; RV32I-NEXT: slli t3, a4, 1
+; RV32I-NEXT: sll a6, t3, a6
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: srl a4, a4, a7
+; RV32I-NEXT: sb a4, 28(a2)
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: sb a7, 31(a2)
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: sb a7, 30(a2)
; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(a2)
-; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb t1, 16(a2)
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, a6, 24
; RV32I-NEXT: sb a4, 27(a2)
-; RV32I-NEXT: srli a3, a3, 24
-; RV32I-NEXT: sb a3, 19(a2)
-; RV32I-NEXT: srli a3, t1, 24
-; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: srli a3, t3, 24
-; RV32I-NEXT: sb a3, 11(a2)
-; RV32I-NEXT: srli a3, a6, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 3(a2)
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a4, a6, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, a6, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
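(For readers tracing the new RV32I sequence above: the legalized lshr now spills the value zero-extended into a 64-byte, register-width-aligned slot, turns the whole-word part of the shift amount into an aligned byte offset ("srli a0, a7, 3" + "andi a0, a0, 28"), and funnel-shifts adjacent words with an srl / "slli 1" / sll-by-(shamt xor 31) triple. Below is a minimal C++ sketch of that pattern, not the legalizer code itself; the function name and signature are illustrative, it assumes ShAmt < 256 since larger amounts are poison in LLVM IR, and it assumes a little-endian host to match the RV32 output shown.)

#include <cstdint>
#include <cstring>

void lshr_32bytes_sketch(const uint8_t *Src, unsigned ShAmt, uint8_t *Dst) {
  // Twice-as-wide, word-aligned buffer: value in the low half, zeros above,
  // so every word load below is aligned and in bounds.
  alignas(uint32_t) uint32_t Buf[16] = {};
  std::memcpy(Buf, Src, 32);

  const uint32_t *P = Buf + ShAmt / 32; // asm folds srli+andi into one add
  unsigned S = ShAmt % 32;              // hardware srl/sll already mask to 5 bits

  for (int I = 0; I < 8; ++I) {
    // Pre-shifting the high neighbor by 1 keeps the variable shift at
    // (31 - S), which stays in range even when S == 0 (a shift by 32
    // would be undefined); for S == 0 the high contribution is simply 0.
    uint32_t W = (P[I] >> S) | ((P[I + 1] << 1) << (31 - S));
    std::memcpy(Dst + 4 * I, &W, 4);    // the test writes this byte-by-byte
  }
}

(Note that once S is provably zero, the whole OR term vanishes and only the word-granularity loads and stores remain, which is why the aligned slot matters.)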
@@ -2104,191 +1723,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
-; RV64I-NEXT: lbu s9, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: sb a0, 116(sp)
-; RV64I-NEXT: sb a5, 115(sp)
-; RV64I-NEXT: sb a6, 114(sp)
-; RV64I-NEXT: sb a7, 113(sp)
-; RV64I-NEXT: sb s11, 112(sp)
-; RV64I-NEXT: sb s10, 111(sp)
-; RV64I-NEXT: sb ra, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 88
-; RV64I-NEXT: sub a0, a1, a0
-; RV64I-NEXT: lbu a1, 9(a0)
-; RV64I-NEXT: lbu a3, 8(a0)
-; RV64I-NEXT: lbu a4, 10(a0)
-; RV64I-NEXT: lbu a5, 11(a0)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: lbu a3, 13(a0)
-; RV64I-NEXT: lbu a4, 12(a0)
-; RV64I-NEXT: lbu a5, 14(a0)
-; RV64I-NEXT: lbu a6, 15(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a3, a3, a1
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a4, 1(a0)
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a6, 2(a0)
-; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 5(a0)
-; RV64I-NEXT: lbu a6, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
-; RV64I-NEXT: lbu t0, 7(a0)
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2297,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 25(a0)
-; RV64I-NEXT: lbu a6, 24(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu t0, 27(a0)
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 29(a0)
-; RV64I-NEXT: lbu a7, 28(a0)
-; RV64I-NEXT: lbu t0, 30(a0)
-; RV64I-NEXT: lbu t1, 31(a0)
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -2319,439 +1790,353 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: slli a6, a6, 32
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu a7, 16(a0)
-; RV64I-NEXT: lbu t0, 18(a0)
-; RV64I-NEXT: lbu t1, 19(a0)
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: lbu a7, 21(a0)
-; RV64I-NEXT: or t0, t1, t0
-; RV64I-NEXT: or a6, t0, a6
-; RV64I-NEXT: lbu t0, 20(a0)
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t1, 22(a0)
-; RV64I-NEXT: lbu a0, 23(a0)
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: srli t0, a4, 1
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or t1, a0, t1
-; RV64I-NEXT: xori t2, a1, 63
-; RV64I-NEXT: srl a0, t0, t2
-; RV64I-NEXT: or a7, t1, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: srli a7, a6, 1
-; RV64I-NEXT: srl a7, a7, t2
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: ld a5, 0(a3)
+; RV64I-NEXT: sll a0, a4, a1
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: srli a7, a5, 1
+; RV64I-NEXT: ld t0, 24(a3)
+; RV64I-NEXT: ld a3, 16(a3)
+; RV64I-NEXT: srl a7, a7, a6
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: sll a7, t0, a1
; RV64I-NEXT: srli t0, a3, 1
-; RV64I-NEXT: not t1, a1
-; RV64I-NEXT: srl t0, t0, t1
+; RV64I-NEXT: srl t0, t0, a6
+; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: sll a3, a3, a1
-; RV64I-NEXT: sll a5, a5, a1
-; RV64I-NEXT: sll a6, a6, a1
-; RV64I-NEXT: sll a1, a4, a1
-; RV64I-NEXT: srli a4, a6, 56
-; RV64I-NEXT: sb a4, 23(a2)
-; RV64I-NEXT: srli a4, a6, 48
-; RV64I-NEXT: sb a4, 22(a2)
-; RV64I-NEXT: srli a4, a6, 40
-; RV64I-NEXT: sb a4, 21(a2)
-; RV64I-NEXT: srli a4, a6, 32
-; RV64I-NEXT: sb a4, 20(a2)
-; RV64I-NEXT: srli a4, a6, 24
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: srli a4, a6, 16
-; RV64I-NEXT: sb a4, 18(a2)
-; RV64I-NEXT: or a4, a6, t0
-; RV64I-NEXT: srli a6, a6, 8
-; RV64I-NEXT: sb a6, 17(a2)
-; RV64I-NEXT: srli a6, a5, 56
-; RV64I-NEXT: sb a6, 31(a2)
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 30(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 29(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 28(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 27(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 26(a2)
-; RV64I-NEXT: or a6, a5, a7
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 25(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 7(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 6(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 5(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 4(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 3(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 2(a2)
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a6
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: sll a1, a5, a1
; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: srli a4, a1, 56
+; RV64I-NEXT: sb a4, 7(a2)
+; RV64I-NEXT: srli a4, a1, 48
+; RV64I-NEXT: sb a4, 6(a2)
+; RV64I-NEXT: srli a4, a1, 40
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: srli a4, a1, 32
+; RV64I-NEXT: sb a4, 4(a2)
+; RV64I-NEXT: srli a4, a1, 24
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: srli a4, a1, 16
+; RV64I-NEXT: sb a4, 2(a2)
; RV64I-NEXT: srli a1, a1, 8
; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: sb a3, 16(a2)
+; RV64I-NEXT: sb a7, 24(a2)
+; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: srli a1, a3, 56
-; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: sb a1, 23(a2)
; RV64I-NEXT: srli a1, a3, 48
-; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: sb a1, 22(a2)
; RV64I-NEXT: srli a1, a3, 40
-; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb a1, 21(a2)
; RV64I-NEXT: srli a1, a3, 32
-; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: sb a1, 20(a2)
; RV64I-NEXT: srli a1, a3, 24
-; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: sb a1, 19(a2)
; RV64I-NEXT: srli a1, a3, 16
-; RV64I-NEXT: sb a1, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: sb a1, 18(a2)
; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: sb a4, 16(a2)
-; RV64I-NEXT: sb a6, 24(a2)
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s10, 1(a1)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or s10, s10, s11
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: or t0, a1, s10
-; RV32I-NEXT: lbu s10, 24(a0)
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu a6, 26(a0)
-; RV32I-NEXT: lbu a5, 27(a0)
-; RV32I-NEXT: lbu a1, 31(a0)
-; RV32I-NEXT: lbu a3, 30(a0)
-; RV32I-NEXT: lbu a4, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: sb a3, 90(sp)
-; RV32I-NEXT: sb a4, 89(sp)
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a5, 87(sp)
-; RV32I-NEXT: sb a6, 86(sp)
-; RV32I-NEXT: sb a7, 85(sp)
-; RV32I-NEXT: sb s10, 84(sp)
-; RV32I-NEXT: sb ra, 83(sp)
-; RV32I-NEXT: sb s11, 82(sp)
-; RV32I-NEXT: sb s9, 81(sp)
-; RV32I-NEXT: sb s8, 80(sp)
-; RV32I-NEXT: sb s7, 79(sp)
-; RV32I-NEXT: sb s6, 78(sp)
-; RV32I-NEXT: sb s5, 77(sp)
-; RV32I-NEXT: sb s4, 76(sp)
-; RV32I-NEXT: sb zero, 59(sp)
-; RV32I-NEXT: sb zero, 58(sp)
-; RV32I-NEXT: sb zero, 57(sp)
-; RV32I-NEXT: sb zero, 56(sp)
-; RV32I-NEXT: sb zero, 55(sp)
-; RV32I-NEXT: sb zero, 54(sp)
-; RV32I-NEXT: sb zero, 53(sp)
-; RV32I-NEXT: sb zero, 52(sp)
-; RV32I-NEXT: sb zero, 51(sp)
-; RV32I-NEXT: sb zero, 50(sp)
-; RV32I-NEXT: sb zero, 49(sp)
-; RV32I-NEXT: sb zero, 48(sp)
-; RV32I-NEXT: sb zero, 47(sp)
-; RV32I-NEXT: sb zero, 46(sp)
-; RV32I-NEXT: sb zero, 45(sp)
-; RV32I-NEXT: sb zero, 44(sp)
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb s3, 75(sp)
-; RV32I-NEXT: sb s2, 74(sp)
-; RV32I-NEXT: sb s1, 73(sp)
-; RV32I-NEXT: sb s0, 72(sp)
-; RV32I-NEXT: sb t6, 71(sp)
-; RV32I-NEXT: sb t5, 70(sp)
-; RV32I-NEXT: sb t4, 69(sp)
-; RV32I-NEXT: sb t3, 68(sp)
-; RV32I-NEXT: sb t2, 67(sp)
-; RV32I-NEXT: sb t1, 66(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 63(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 62(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: slli a0, t0, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 60
-; RV32I-NEXT: sub a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a1, t0, 7
-; RV32I-NEXT: lbu a0, 1(a4)
-; RV32I-NEXT: lbu a3, 0(a4)
-; RV32I-NEXT: lbu a5, 2(a4)
-; RV32I-NEXT: lbu a6, 3(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a6, a5
-; RV32I-NEXT: or a6, a3, a0
-; RV32I-NEXT: srli a0, a6, 1
-; RV32I-NEXT: xori a7, a1, 31
-; RV32I-NEXT: srl a0, a0, a7
-; RV32I-NEXT: lbu a3, 13(a4)
-; RV32I-NEXT: lbu a5, 12(a4)
-; RV32I-NEXT: lbu t0, 14(a4)
-; RV32I-NEXT: lbu t1, 15(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a5
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a5, t1, t0
-; RV32I-NEXT: or t0, a5, a3
-; RV32I-NEXT: lbu a3, 9(a4)
-; RV32I-NEXT: lbu a5, 8(a4)
-; RV32I-NEXT: lbu t1, 10(a4)
-; RV32I-NEXT: lbu t2, 11(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a5, t2, t1
-; RV32I-NEXT: or t1, a5, a3
-; RV32I-NEXT: srli a3, t1, 1
-; RV32I-NEXT: srl a5, a3, a7
-; RV32I-NEXT: srli t4, t5, 1
-; RV32I-NEXT: not t2, a1
-; RV32I-NEXT: lbu a3, 21(a4)
-; RV32I-NEXT: lbu t3, 20(a4)
-; RV32I-NEXT: lbu t6, 22(a4)
-; RV32I-NEXT: lbu s0, 23(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t3, s0, t6
-; RV32I-NEXT: or t3, t3, a3
-; RV32I-NEXT: lbu a3, 17(a4)
-; RV32I-NEXT: lbu t6, 16(a4)
-; RV32I-NEXT: lbu s0, 18(a4)
-; RV32I-NEXT: lbu s1, 19(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a3
-; RV32I-NEXT: lbu a3, 29(a4)
-; RV32I-NEXT: lbu t6, 28(a4)
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu s2, 31(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: lbu s1, 25(a4)
-; RV32I-NEXT: lbu s2, 24(a4)
-; RV32I-NEXT: srl t4, t4, t2
-; RV32I-NEXT: or t6, t6, a3
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: or a3, s1, s2
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu a4, 27(a4)
-; RV32I-NEXT: srli s2, s0, 1
-; RV32I-NEXT: srl s2, s2, a7
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: srli s1, t0, 1
-; RV32I-NEXT: srl s1, s1, t2
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: srli a3, a4, 1
-; RV32I-NEXT: srl a7, a3, a7
-; RV32I-NEXT: srli a3, t3, 1
-; RV32I-NEXT: srl t2, a3, t2
-; RV32I-NEXT: sll a3, t5, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll t1, t1, a1
-; RV32I-NEXT: sll t3, t3, a1
-; RV32I-NEXT: sll t5, s0, a1
-; RV32I-NEXT: sll t6, t6, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll a1, a6, a1
-; RV32I-NEXT: srli a6, a4, 24
-; RV32I-NEXT: sb a6, 27(a2)
-; RV32I-NEXT: srli a6, a4, 16
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: or a6, a4, t2
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t2, 56(sp)
+; RV32I-NEXT: sw t1, 52(sp)
+; RV32I-NEXT: sw t0, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: addi a1, sp, 32
+; RV32I-NEXT: sub a4, a1, a0
+; RV32I-NEXT: lw a3, 4(a4)
+; RV32I-NEXT: lw a5, 0(a4)
+; RV32I-NEXT: sll a0, a3, a7
+; RV32I-NEXT: andi a1, a7, 31
+; RV32I-NEXT: xori a6, a1, 31
+; RV32I-NEXT: srli a1, a5, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 8(a4)
+; RV32I-NEXT: srl a1, a1, a6
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a7
+; RV32I-NEXT: srli t2, t1, 1
+; RV32I-NEXT: srl t2, t2, a6
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sll t1, t1, a7
+; RV32I-NEXT: srli a3, a3, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 16(a4)
+; RV32I-NEXT: srl a3, a3, a6
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: sll t1, t2, a7
+; RV32I-NEXT: srli t4, t3, 1
+; RV32I-NEXT: srl t4, t4, a6
+; RV32I-NEXT: or t1, t1, t4
+; RV32I-NEXT: sll t3, t3, a7
+; RV32I-NEXT: srli t0, t0, 1
+; RV32I-NEXT: lw t4, 28(a4)
+; RV32I-NEXT: lw a4, 24(a4)
+; RV32I-NEXT: srl t0, t0, a6
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: sll t3, t4, a7
+; RV32I-NEXT: srli t4, a4, 1
+; RV32I-NEXT: srl t4, t4, a6
+; RV32I-NEXT: or t3, t3, t4
+; RV32I-NEXT: sll a4, a4, a7
+; RV32I-NEXT: srli t2, t2, 1
+; RV32I-NEXT: srl a6, t2, a6
+; RV32I-NEXT: or a4, a4, a6
+; RV32I-NEXT: sll a5, a5, a7
+; RV32I-NEXT: sb a5, 0(a2)
+; RV32I-NEXT: srli a6, a5, 24
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: srli a6, a5, 16
+; RV32I-NEXT: sb a6, 2(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 1(a2)
+; RV32I-NEXT: sb a4, 24(a2)
+; RV32I-NEXT: sb t3, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb t1, 20(a2)
+; RV32I-NEXT: sb a3, 8(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a4, 24
+; RV32I-NEXT: sb a5, 27(a2)
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: sb a5, 26(a2)
; RV32I-NEXT: srli a4, a4, 8
; RV32I-NEXT: sb a4, 25(a2)
-; RV32I-NEXT: srli a4, t6, 24
+; RV32I-NEXT: srli a4, t3, 24
; RV32I-NEXT: sb a4, 31(a2)
-; RV32I-NEXT: srli a4, t6, 16
+; RV32I-NEXT: srli a4, t3, 16
; RV32I-NEXT: sb a4, 30(a2)
-; RV32I-NEXT: or a4, t6, a7
-; RV32I-NEXT: srli a7, t6, 8
-; RV32I-NEXT: sb a7, 29(a2)
-; RV32I-NEXT: srli a7, t5, 24
-; RV32I-NEXT: sb a7, 19(a2)
-; RV32I-NEXT: srli a7, t5, 16
-; RV32I-NEXT: sb a7, 18(a2)
-; RV32I-NEXT: or a7, t5, s1
-; RV32I-NEXT: srli t2, t5, 8
-; RV32I-NEXT: sb t2, 17(a2)
-; RV32I-NEXT: srli t2, t3, 24
-; RV32I-NEXT: sb t2, 23(a2)
-; RV32I-NEXT: srli t2, t3, 16
-; RV32I-NEXT: sb t2, 22(a2)
-; RV32I-NEXT: or t2, t3, s2
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, t1, 24
-; RV32I-NEXT: sb t3, 11(a2)
-; RV32I-NEXT: srli t3, t1, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, t1, t4
-; RV32I-NEXT: srli t1, t1, 8
-; RV32I-NEXT: sb t1, 9(a2)
-; RV32I-NEXT: srli t1, t0, 24
-; RV32I-NEXT: sb t1, 15(a2)
-; RV32I-NEXT: srli t1, t0, 16
-; RV32I-NEXT: sb t1, 14(a2)
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: srli t0, t0, 8
-; RV32I-NEXT: sb t0, 13(a2)
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: srli t0, a1, 16
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a4, t3, 8
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 14(a2)
; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 1(a2)
-; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 13(a2)
+; RV32I-NEXT: srli a1, a0, 24
; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: srli a1, a0, 16
; RV32I-NEXT: sb a1, 6(a2)
-; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 5(a2)
-; RV32I-NEXT: sb a6, 24(a2)
-; RV32I-NEXT: sb a4, 28(a2)
-; RV32I-NEXT: sb a7, 16(a2)
-; RV32I-NEXT: sb t2, 20(a2)
-; RV32I-NEXT: sb t3, 8(a2)
-; RV32I-NEXT: sb a5, 12(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
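(The shl output is the mirror image: the zeros are stored below the value, and the aligned byte offset is subtracted from the value's base ("addi a1, sp, 32" + "sub a4, a1, a0") instead of added. A matching C++ sketch, with the same caveats and illustrative naming as the lshr one above:)

#include <cstdint>
#include <cstring>

void shl_32bytes_sketch(const uint8_t *Src, unsigned ShAmt, uint8_t *Dst) {
  // Zeros in the low half this time; the value sits in the high half.
  alignas(uint32_t) uint32_t Buf[16] = {};
  std::memcpy(Buf + 8, Src, 32);

  const uint32_t *P = Buf + 8 - ShAmt / 32; // subtract, not add
  unsigned S = ShAmt % 32;

  for (int I = 0; I < 8; ++I) {
    // For result word 0, P[I - 1] always lands in the zeroed low half,
    // which is why the generated code can omit the OR for that word
    // ("sll a5, a5, a7" followed directly by the stores).
    uint32_t W = (P[I] << S) | ((P[I - 1] >> 1) >> (31 - S));
    std::memcpy(Dst + 4 * I, &W, 4);
  }
}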
@@ -2762,200 +2147,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t3, 6(a0)
-; RV64I-NEXT: lbu t4, 7(a0)
-; RV64I-NEXT: lbu t5, 8(a0)
-; RV64I-NEXT: lbu t6, 9(a0)
-; RV64I-NEXT: lbu s0, 10(a0)
-; RV64I-NEXT: lbu s1, 11(a0)
-; RV64I-NEXT: lbu s2, 12(a0)
-; RV64I-NEXT: lbu s3, 13(a0)
-; RV64I-NEXT: lbu s4, 14(a0)
-; RV64I-NEXT: lbu s5, 15(a0)
-; RV64I-NEXT: lbu s6, 16(a0)
-; RV64I-NEXT: lbu s7, 17(a0)
-; RV64I-NEXT: lbu s8, 18(a0)
-; RV64I-NEXT: lbu s9, 19(a0)
-; RV64I-NEXT: lbu a3, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: or a3, a3, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or a3, s11, a3
-; RV64I-NEXT: lbu s11, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s11
-; RV64I-NEXT: lbu s11, 20(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 21(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t2, a1, a3
-; RV64I-NEXT: lbu t0, 23(a0)
-; RV64I-NEXT: lbu a7, 24(a0)
-; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a4, 28(a0)
-; RV64I-NEXT: lbu a0, 27(a0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a4, 84(sp)
-; RV64I-NEXT: sb a0, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb t0, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb ra, 77(sp)
-; RV64I-NEXT: sb s11, 76(sp)
-; RV64I-NEXT: sb s9, 75(sp)
-; RV64I-NEXT: sb s8, 74(sp)
-; RV64I-NEXT: sb s7, 73(sp)
-; RV64I-NEXT: sb s6, 72(sp)
-; RV64I-NEXT: sb s5, 71(sp)
-; RV64I-NEXT: sb s4, 70(sp)
-; RV64I-NEXT: sb s3, 69(sp)
-; RV64I-NEXT: sb s2, 68(sp)
-; RV64I-NEXT: sb s1, 67(sp)
-; RV64I-NEXT: sb s0, 66(sp)
-; RV64I-NEXT: sb t6, 65(sp)
-; RV64I-NEXT: sb t5, 64(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb t4, 63(sp)
-; RV64I-NEXT: sb t3, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: slli a0, t2, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a1, a1, a0
-; RV64I-NEXT: lbu a0, 9(a1)
-; RV64I-NEXT: lbu a3, 8(a1)
-; RV64I-NEXT: lbu a4, 10(a1)
-; RV64I-NEXT: lbu a5, 11(a1)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a3, 13(a1)
-; RV64I-NEXT: lbu a4, 12(a1)
-; RV64I-NEXT: lbu a5, 14(a1)
-; RV64I-NEXT: lbu a6, 15(a1)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a4, a3, a0
-; RV64I-NEXT: andi a3, t2, 7
-; RV64I-NEXT: lbu a0, 17(a1)
-; RV64I-NEXT: lbu a5, 16(a1)
-; RV64I-NEXT: lbu a6, 18(a1)
-; RV64I-NEXT: lbu a7, 19(a1)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a1)
-; RV64I-NEXT: lbu a6, 20(a1)
-; RV64I-NEXT: lbu a7, 22(a1)
-; RV64I-NEXT: lbu t0, 23(a1)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2963,467 +2191,378 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a3
-; RV64I-NEXT: sll a0, a0, a6
-; RV64I-NEXT: lbu a6, 1(a1)
-; RV64I-NEXT: lbu a7, 0(a1)
-; RV64I-NEXT: lbu t0, 2(a1)
-; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a1)
-; RV64I-NEXT: lbu t0, 4(a1)
-; RV64I-NEXT: lbu t1, 6(a1)
-; RV64I-NEXT: lbu t2, 7(a1)
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a1)
-; RV64I-NEXT: lbu t0, 24(a1)
-; RV64I-NEXT: lbu t1, 26(a1)
-; RV64I-NEXT: lbu t2, 27(a1)
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
; RV64I-NEXT: or t0, t2, t1
; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a1)
-; RV64I-NEXT: lbu t1, 28(a1)
-; RV64I-NEXT: lbu t2, 30(a1)
-; RV64I-NEXT: lbu a1, 31(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: or t0, t0, t1
; RV64I-NEXT: slli t2, t2, 16
; RV64I-NEXT: slli a1, a1, 24
; RV64I-NEXT: or a1, a1, t2
-; RV64I-NEXT: slli t1, a4, 1
; RV64I-NEXT: or a1, a1, t0
-; RV64I-NEXT: xori t0, a3, 63
-; RV64I-NEXT: sll t1, t1, t0
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a7, a1, a7
-; RV64I-NEXT: slli a1, a7, 1
-; RV64I-NEXT: sll t0, a1, t0
-; RV64I-NEXT: srl a1, a4, a3
-; RV64I-NEXT: srl a4, a6, a3
-; RV64I-NEXT: srl a5, a5, a3
-; RV64I-NEXT: sra a3, a7, a3
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: srl a0, a4, a1
+; RV64I-NEXT: ld a5, 16(a3)
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: ld a7, 0(a3)
+; RV64I-NEXT: slli t0, a5, 1
+; RV64I-NEXT: sll t0, t0, a6
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: srl a7, a7, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a6
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli a7, a3, 1
+; RV64I-NEXT: sll a6, a7, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
; RV64I-NEXT: sb a5, 16(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a5, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a5, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a5, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a5, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a5, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a5, 16
+; RV64I-NEXT: sb a1, 18(a2)
; RV64I-NEXT: srli a5, a5, 8
; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a3, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a3, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a3, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a3, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a3, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a3, 16
-; RV64I-NEXT: sb a5, 26(a2)
-; RV64I-NEXT: sb a3, 24(a2)
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 25(a2)
-; RV64I-NEXT: srli a3, a4, 48
-; RV64I-NEXT: sb a3, 6(a2)
-; RV64I-NEXT: srli a3, a4, 40
-; RV64I-NEXT: sb a3, 5(a2)
-; RV64I-NEXT: srli a3, a4, 32
-; RV64I-NEXT: sb a3, 4(a2)
-; RV64I-NEXT: srli a3, a4, 24
-; RV64I-NEXT: sb a3, 3(a2)
-; RV64I-NEXT: srli a3, a4, 16
-; RV64I-NEXT: sb a3, 2(a2)
-; RV64I-NEXT: or a3, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a4, a4, 8
; RV64I-NEXT: sb a4, 1(a2)
-; RV64I-NEXT: srli a4, a1, 48
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: srli a4, a1, 40
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: srli a4, a1, 32
-; RV64I-NEXT: sb a4, 12(a2)
-; RV64I-NEXT: srli a4, a1, 24
-; RV64I-NEXT: sb a4, 11(a2)
-; RV64I-NEXT: srli a4, a1, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a1, a0
-; RV64I-NEXT: sb a1, 8(a2)
-; RV64I-NEXT: srli a1, a1, 8
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: srli a1, a6, 56
-; RV64I-NEXT: sb a1, 23(a2)
-; RV64I-NEXT: srli a3, a3, 56
-; RV64I-NEXT: sb a3, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t3, 31(a0)
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t2, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or a3, a3, s11
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: or t1, a1, a3
-; RV32I-NEXT: lbu t0, 23(a0)
-; RV32I-NEXT: lbu a7, 24(a0)
-; RV32I-NEXT: lbu a6, 25(a0)
-; RV32I-NEXT: lbu a5, 26(a0)
-; RV32I-NEXT: lbu a1, 30(a0)
-; RV32I-NEXT: lbu a3, 29(a0)
-; RV32I-NEXT: lbu a4, 28(a0)
-; RV32I-NEXT: lbu a0, 27(a0)
-; RV32I-NEXT: sb a1, 58(sp)
-; RV32I-NEXT: sb a3, 57(sp)
-; RV32I-NEXT: sb a4, 56(sp)
-; RV32I-NEXT: sb a0, 55(sp)
-; RV32I-NEXT: sb a5, 54(sp)
-; RV32I-NEXT: sb a6, 53(sp)
-; RV32I-NEXT: sb a7, 52(sp)
-; RV32I-NEXT: sb t0, 51(sp)
-; RV32I-NEXT: sb ra, 50(sp)
-; RV32I-NEXT: sb s11, 49(sp)
-; RV32I-NEXT: sb s10, 48(sp)
-; RV32I-NEXT: sb s9, 47(sp)
-; RV32I-NEXT: sb s8, 46(sp)
-; RV32I-NEXT: sb s7, 45(sp)
-; RV32I-NEXT: sb s6, 44(sp)
-; RV32I-NEXT: sb s5, 43(sp)
-; RV32I-NEXT: sb t3, 59(sp)
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: sb s4, 42(sp)
-; RV32I-NEXT: sb s3, 41(sp)
-; RV32I-NEXT: sb s2, 40(sp)
-; RV32I-NEXT: sb s1, 39(sp)
-; RV32I-NEXT: sb s0, 38(sp)
-; RV32I-NEXT: sb t6, 37(sp)
-; RV32I-NEXT: sb t5, 36(sp)
-; RV32I-NEXT: sb t4, 35(sp)
-; RV32I-NEXT: sb t2, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: srai a0, t3, 31
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a0, 84(sp)
-; RV32I-NEXT: sb a0, 80(sp)
-; RV32I-NEXT: sb a0, 76(sp)
-; RV32I-NEXT: sb a0, 72(sp)
-; RV32I-NEXT: sb a0, 68(sp)
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: srli a1, a0, 24
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 90(sp)
-; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a1, 87(sp)
-; RV32I-NEXT: sb a3, 86(sp)
-; RV32I-NEXT: sb a0, 85(sp)
-; RV32I-NEXT: sb a1, 83(sp)
-; RV32I-NEXT: sb a3, 82(sp)
-; RV32I-NEXT: sb a0, 81(sp)
-; RV32I-NEXT: sb a1, 79(sp)
-; RV32I-NEXT: sb a3, 78(sp)
-; RV32I-NEXT: sb a0, 77(sp)
-; RV32I-NEXT: sb a1, 75(sp)
-; RV32I-NEXT: sb a3, 74(sp)
-; RV32I-NEXT: sb a0, 73(sp)
-; RV32I-NEXT: sb a1, 71(sp)
-; RV32I-NEXT: sb a3, 70(sp)
-; RV32I-NEXT: sb a0, 69(sp)
-; RV32I-NEXT: sb a1, 67(sp)
-; RV32I-NEXT: sb a3, 66(sp)
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: sb a1, 63(sp)
-; RV32I-NEXT: sb a3, 62(sp)
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: slli a0, t1, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 28
-; RV32I-NEXT: add a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a3, t1, 7
-; RV32I-NEXT: lbu a0, 9(a4)
-; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
-; RV32I-NEXT: or a6, a1, a0
-; RV32I-NEXT: slli a0, a6, 1
-; RV32I-NEXT: not t1, a3
-; RV32I-NEXT: sll a0, a0, t1
-; RV32I-NEXT: lbu a1, 1(a4)
-; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or t0, a5, a1
-; RV32I-NEXT: slli a1, t5, 1
-; RV32I-NEXT: xori t2, a3, 31
-; RV32I-NEXT: sll a1, a1, t2
-; RV32I-NEXT: lbu a5, 13(a4)
-; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t3, a7, a5
-; RV32I-NEXT: lbu a5, 17(a4)
-; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
-; RV32I-NEXT: or t4, a7, a5
-; RV32I-NEXT: slli a5, t4, 1
-; RV32I-NEXT: sll a7, a5, t1
-; RV32I-NEXT: lbu a5, 21(a4)
-; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a5
-; RV32I-NEXT: lbu a5, 25(a4)
-; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: or t6, t6, a5
-; RV32I-NEXT: lbu a5, 29(a4)
-; RV32I-NEXT: lbu s1, 28(a4)
-; RV32I-NEXT: slli s2, t6, 1
-; RV32I-NEXT: sll t1, s2, t1
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: slli s1, s0, 1
-; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: or s3, a4, a5
-; RV32I-NEXT: slli a4, s3, 1
-; RV32I-NEXT: sll t2, a4, t2
-; RV32I-NEXT: srl a4, t5, a3
-; RV32I-NEXT: srl a5, t0, a3
-; RV32I-NEXT: srl t0, t3, a3
-; RV32I-NEXT: srl a6, a6, a3
-; RV32I-NEXT: srl t3, s0, a3
-; RV32I-NEXT: srl t4, t4, a3
-; RV32I-NEXT: srl t5, t6, a3
-; RV32I-NEXT: sra a3, s3, a3
-; RV32I-NEXT: srli t6, t5, 16
-; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: or t2, t5, t2
-; RV32I-NEXT: sb t5, 24(a2)
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: sb t5, 25(a2)
-; RV32I-NEXT: srli t5, a3, 24
-; RV32I-NEXT: sb t5, 31(a2)
-; RV32I-NEXT: srli t5, a3, 16
-; RV32I-NEXT: sb t5, 30(a2)
-; RV32I-NEXT: sb a3, 28(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 29(a2)
-; RV32I-NEXT: srli a3, t4, 16
-; RV32I-NEXT: sb a3, 18(a2)
-; RV32I-NEXT: or a3, t4, s1
-; RV32I-NEXT: sb t4, 16(a2)
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb t4, 17(a2)
-; RV32I-NEXT: srli t4, t3, 16
-; RV32I-NEXT: sb t4, 22(a2)
-; RV32I-NEXT: or t1, t3, t1
-; RV32I-NEXT: sb t3, 20(a2)
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, a6, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, a6, s2
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: srli a6, a6, 8
-; RV32I-NEXT: sb a6, 9(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: sb a6, 14(a2)
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: srli a7, t0, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a5, 16
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: sb a5, 0(a2)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 1(a2)
-; RV32I-NEXT: srli a5, a4, 16
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t3, t3, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t3, 28(sp)
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: mv a1, sp
+; RV32I-NEXT: add a4, a1, a0
+; RV32I-NEXT: lw a1, 4(a4)
+; RV32I-NEXT: srl a0, a1, a7
+; RV32I-NEXT: lw a5, 8(a4)
+; RV32I-NEXT: andi a3, a7, 31
+; RV32I-NEXT: xori a6, a3, 31
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 16(a4)
+; RV32I-NEXT: sll a1, a1, a6
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srl a3, t0, a7
+; RV32I-NEXT: slli t2, t1, 1
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or a3, a3, t2
+; RV32I-NEXT: srl a5, a5, a7
+; RV32I-NEXT: slli t0, t0, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 24(a4)
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a5, a5, t0
+; RV32I-NEXT: srl t0, t2, a7
+; RV32I-NEXT: slli t4, t3, 1
+; RV32I-NEXT: sll t4, t4, a6
+; RV32I-NEXT: or t0, t0, t4
+; RV32I-NEXT: srl t1, t1, a7
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: srl t2, t3, a7
+; RV32I-NEXT: slli t3, a4, 1
+; RV32I-NEXT: sll a6, t3, a6
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: sra a4, a4, a7
+; RV32I-NEXT: sb a4, 28(a2)
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: sb a7, 31(a2)
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: sb a7, 30(a2)
; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(a2)
-; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb t1, 16(a2)
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, a6, 24
; RV32I-NEXT: sb a4, 27(a2)
-; RV32I-NEXT: srli a3, a3, 24
-; RV32I-NEXT: sb a3, 19(a2)
-; RV32I-NEXT: srli a3, t1, 24
-; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: srli a3, t3, 24
-; RV32I-NEXT: sb a3, 11(a2)
-; RV32I-NEXT: srli a3, a6, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 3(a2)
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a4, a6, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, a6, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1d3b015f3c5479..c350ed64280dd2 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -174,22 +174,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86: # %bb.0: # %_udiv-special-cases
; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $156, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $176, %esp
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 16(%ebp), %edx
; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: xorl %eax, %ecx
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -198,32 +199,33 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: sarl $31, %edx
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl 32(%ebp), %ebx
; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: xorl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 28(%ebp), %edi
; X86-NEXT: xorl %edx, %edi
; X86-NEXT: subl %edx, %edi
-; X86-NEXT: sbbl %edx, %ebp
; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %esi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
@@ -232,359 +234,357 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebp, %edx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %edi, %edi
; X86-NEXT: xorl $31, %edi
; X86-NEXT: orl $32, %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebp, %ebp
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %edi
; X86-NEXT: orl $64, %edi
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: bsrl %ebp, %ecx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: orl $64, %edx
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %eax, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: subl %edx, %edi
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
-; X86-NEXT: movl $0, %eax
-; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %edi, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ecx
-; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %edx
-; X86-NEXT: cmovnel %ebx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %ebx, %eax
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: jne .LBB4_8
-; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: xorl $127, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.8: # %_udiv-special-cases
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl $127, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: je .LBB4_8
-; X86-NEXT: # %bb.2: # %udiv-bb1
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: je .LBB4_9
+; X86-NEXT: # %bb.5: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: xorb $127, %al
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 148(%esp,%edi), %edx
-; X86-NEXT: movl 152(%esp,%edi), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 144(%esp,%edi), %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: shrl %ebp
-; X86-NEXT: shrl %cl, %ebp
-; X86-NEXT: orl %edx, %ebp
-; X86-NEXT: movl 140(%esp,%edi), %edx
-; X86-NEXT: movb %ch, %cl
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 152(%esp,%eax), %esi
+; X86-NEXT: movl 156(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esp,%eax), %edx
+; X86-NEXT: movl 148(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: jae .LBB4_3
+; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %edi, %edi
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB4_7
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jmp .LBB4_9
+; X86-NEXT: .LBB4_2: # %udiv-preheader
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 108(%esp,%eax), %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %dl, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %edx
-; X86-NEXT: movl 104(%esp,%edx), %ebx
-; X86-NEXT: movl 100(%esp,%edx), %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %ebx, %ebp
-; X86-NEXT: movl 92(%esp,%edx), %esi
+; X86-NEXT: movl 104(%esp,%eax), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%edx), %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl 96(%esp,%eax), %esi
+; X86-NEXT: movl 100(%esp,%eax), %eax
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shrdl %cl, %ebx, %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $-1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB4_4: # %udiv-do-while
+; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: shldl $1, %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: orl %esi, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: sbbl %edi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: subl %ecx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: jne .LBB4_4
-; X86-NEXT: # %bb.5:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB4_3
+; X86-NEXT: # %bb.4:
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebp, %edx
+; X86-NEXT: shldl $1, %ebx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: addl %edi, %edx
; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: shldl $1, %eax, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: .LBB4_8: # %udiv-end
+; X86-NEXT: .LBB4_9: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: xorl %ecx, %ebp
-; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: subl %ecx, %edx
; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl %ebx, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %ebp, 8(%ecx)
-; X86-NEXT: movl %edx, 12(%ecx)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebp, %edi
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: imull %eax, %ebx
; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull %ebp, %edi
+; X86-NEXT: imull %esi, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 40(%ebp), %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: imull %edx, %esi
+; X86-NEXT: imull %edx, %ebx
; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: addl $156, %esp
+; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 58ea70e58028f1..16dc1d6b446cf7 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -174,379 +174,370 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86: # %bb.0: # %_udiv-special-cases
; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $136, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: orl 36(%ebp), %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sete %bl
-; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sete %cl
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: orl 24(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: orl 20(%ebp), %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
-; X86-NEXT: orb %bl, %al
-; X86-NEXT: movb %al, (%esp) # 1-byte Spill
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: bsrl 36(%ebp), %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %edi, %edi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebp, %ebp
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl $31, %ebp
-; X86-NEXT: orl $32, %ebp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: cmovnel %edx, %ebp
-; X86-NEXT: orl $64, %ebp
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: bsrl %esi, %edx
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: bsrl %ecx, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 16(%ebp), %edi
; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: bsrl 12(%ebp), %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: orl $64, %edx
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: subl %edx, %ebp
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $127, %ecx
-; X86-NEXT: cmpl %ebp, %ecx
+; X86-NEXT: cmpl %eax, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: setb %cl
-; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: xorl $127, %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: cmovnel %edi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmovnel %edi, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmovnel %edi, %ebx
-; X86-NEXT: orb %cl, %al
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb %cl, %ah
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: cmovnel %esi, %ebx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: cmovnel %esi, %ecx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: cmovnel %edx, %esi
+; X86-NEXT: movl 12(%ebp), %edi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: orb %ah, %al
+; X86-NEXT: movl 44(%ebp), %eax
; X86-NEXT: jne .LBB4_7
; X86-NEXT: # %bb.1: # %udiv-bb1
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: xorb $127, %al
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 128(%esp,%eax), %edx
-; X86-NEXT: movl 132(%esp,%eax), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 136(%esp,%eax), %edi
+; X86-NEXT: movl 140(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edi, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 124(%esp,%eax), %ebp
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl 120(%esp,%eax), %eax
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %eax, %ebp
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl 128(%esp,%eax), %ebx
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: addl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 20(%ebp), %ebx
; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.5:
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: jmp .LBB4_6
; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %edx
; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl 16(%ebp), %edx
; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 24(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 84(%esp,%eax), %ebx
+; X86-NEXT: movl 92(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%esp,%eax), %esi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esp,%eax), %ebp
-; X86-NEXT: movl 76(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl 88(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %edi
+; X86-NEXT: movl 84(%esp,%eax), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shrdl %cl, %edx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 32(%ebp), %eax
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ebp
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl $1, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %edx, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl 40(%ebp), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: andl 36(%ebp), %eax
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl 32(%ebp), %edx
+; X86-NEXT: andl 28(%ebp), %ecx
+; X86-NEXT: subl %ecx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edi, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: movl (%esp), %edi # 4-byte Reload
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: .LBB4_6: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: shldl $1, %ebp, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: .LBB4_6: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl 44(%ebp), %eax
; X86-NEXT: .LBB4_7: # %udiv-end
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %ebp, %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: imull %edx, %esi
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: imull %ecx, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %edi
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: imull %ecx, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull 28(%ebp), %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: imull %edx, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull 32(%ebp)
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull 32(%ebp)
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl 24(%ebp), %ecx
; X86-NEXT: sbbl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %esi, (%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: addl $136, %esp
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
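
The hunks above capture the two observable effects of the new lowering on this i128 udiv expansion: the byte index derived from the shift amount (shrb $3) is now masked with andb $12 rather than andb $15, so every load from the stack slot lands on a register boundary, and the old movb/andb $7/notb massaging of the residual count is gone because the full amount can be handed to shld/shrd in %cl, which the hardware already masks mod 32. A minimal C++ sketch of that split, assuming a 32-bit shift unit (the names StackShift and splitShiftAmount are illustrative, not from the patch):

  #include <cstdint>

  struct StackShift {
    uint32_t ByteOffset; // register-aligned offset into the zero-padded slot
    uint32_t Residual;   // bits still to shift after the aligned load
  };

  // Split a shift amount into an aligned slot offset plus an in-register
  // residual; UnitBits is the native register width (32 here, 64 on x86-64).
  StackShift splitShiftAmount(uint32_t C, uint32_t UnitBits) {
    uint32_t UnitBytes = UnitBits / 8;
    return {(C / 8) & ~(UnitBytes - 1), // shrb $3; andb $12 (andb $24 for 64-bit units)
            C % UnitBits};              // low bits, left in %cl
  }
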
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 6fcebdb5116ddd..fb169a3777fb82 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,7 +22,7 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $176, %esp
+; X86-NEXT: subl $160, %esp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -96,18 +96,16 @@ define void @f() nounwind {
; X86-NEXT: addl $1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: andl $3, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
-; X86-NEXT: movb %cl, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: negb %cl
-; X86-NEXT: movsbl %cl, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %esi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -117,29 +115,24 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esp,%esi), %edi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 128(%esp,%esi), %ebx
-; X86-NEXT: movl 132(%esp,%esi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl 112(%esp,%esi), %edi
+; X86-NEXT: movl 116(%esp,%esi), %eax
+; X86-NEXT: movl 120(%esp,%esi), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: je .LBB0_13
; X86-NEXT: # %bb.11: # %udiv-preheader
-; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl %esi, %edi
; X86-NEXT: andl $3, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -148,26 +141,20 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movl 80(%esp,%edx), %edi
-; X86-NEXT: movl 84(%esp,%edx), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 88(%esp,%edx), %ebx
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 72(%esp,%eax), %ebx
+; X86-NEXT: movl 64(%esp,%eax), %esi
+; X86-NEXT: movl 68(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
@@ -175,70 +162,69 @@ define void @f() nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $3, %eax
-; X86-NEXT: andl $3, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $3, %edi
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_12: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: andl $2, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: leal (%eax,%edi,2), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: andl $2, %edx
+; X86-NEXT: shrl %edx
+; X86-NEXT: leal (%edx,%ebx,2), %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: sbbl %ecx, %esi
-; X86-NEXT: shll $30, %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: sarl $30, %eax
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: shrdl $1, %esi, %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: andl $1, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: shll $30, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sarl $30, %edi
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: shrdl $1, %edx, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %edx
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $3, %esi
-; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $3, %edi
+; X86-NEXT: andl $3, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edx, %eax
; X86-NEXT: jne .LBB0_12
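
In pr38539.ll the same change also drops the movb %cl, %ch / andb $7 copy of the count (and 16 bytes of frame), so the whole per-word sequence runs off a single count register: one shrdl per inner word plus a final shift, all keyed on the same %cl. A portable sketch of that double-shift step (a model only; it assumes 0 < N < 32, an assumption the x86 instruction itself does not need since it masks the count):

  #include <cstdint>

  // Models `shrdl %cl, Hi, Lo`: shift Lo right by N, filling the vacated
  // high bits from Hi.
  uint32_t shrd32(uint32_t Lo, uint32_t Hi, uint32_t N) {
    return (Lo >> N) | (Hi << (32 - N)); // undefined in C++ for N == 0
  }
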
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index df3c25a8c42ad4..6be79edbe51e10 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -13,26 +13,24 @@ define i256 @test1(i256 %a) nounwind {
; ILP-LABEL: test1:
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
+; ILP-NEXT: xorps %xmm0, %xmm0
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: leal (%rsi,%rsi), %ecx
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
-; ILP-NEXT: andb $7, %cl
+; ILP-NEXT: andb $24, %dl
; ILP-NEXT: negb %dl
; ILP-NEXT: movsbq %dl, %rdx
-; ILP-NEXT: movq -16(%rsp,%rdx), %rsi
-; ILP-NEXT: movq -8(%rsp,%rdx), %rdi
+; ILP-NEXT: movq -24(%rsp,%rdx), %rsi
+; ILP-NEXT: movq -16(%rsp,%rdx), %rdi
; ILP-NEXT: shldq %cl, %rsi, %rdi
-; ILP-NEXT: movq -32(%rsp,%rdx), %r8
-; ILP-NEXT: movq -24(%rsp,%rdx), %rdx
+; ILP-NEXT: movq -40(%rsp,%rdx), %r8
+; ILP-NEXT: movq -32(%rsp,%rdx), %rdx
; ILP-NEXT: movq %r8, %r9
; ILP-NEXT: shlq %cl, %r9
; ILP-NEXT: movq %rdx, %r10
@@ -52,27 +50,25 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-LABEL: test1:
; HYBRID: # %bb.0:
; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: xorps %xmm0, %xmm0
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: addl %esi, %esi
-; HYBRID-NEXT: addb $3, %sil
-; HYBRID-NEXT: movl %esi, %ecx
-; HYBRID-NEXT: andb $7, %cl
-; HYBRID-NEXT: shrb $3, %sil
-; HYBRID-NEXT: negb %sil
-; HYBRID-NEXT: movsbq %sil, %rdx
-; HYBRID-NEXT: movq -16(%rsp,%rdx), %rsi
-; HYBRID-NEXT: movq -8(%rsp,%rdx), %rdi
+; HYBRID-NEXT: leal (%rsi,%rsi), %ecx
+; HYBRID-NEXT: addb $3, %cl
+; HYBRID-NEXT: movl %ecx, %edx
+; HYBRID-NEXT: shrb $3, %dl
+; HYBRID-NEXT: andb $24, %dl
+; HYBRID-NEXT: negb %dl
+; HYBRID-NEXT: movsbq %dl, %rdx
+; HYBRID-NEXT: movq -24(%rsp,%rdx), %rsi
+; HYBRID-NEXT: movq -16(%rsp,%rdx), %rdi
; HYBRID-NEXT: shldq %cl, %rsi, %rdi
; HYBRID-NEXT: movq %rdi, 24(%rax)
-; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdi
-; HYBRID-NEXT: movq -24(%rsp,%rdx), %rdx
+; HYBRID-NEXT: movq -40(%rsp,%rdx), %rdi
+; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdx
; HYBRID-NEXT: movq %rdx, %r8
; HYBRID-NEXT: shldq %cl, %rdi, %r8
; HYBRID-NEXT: movq %r8, 8(%rax)
@@ -81,6 +77,7 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: shlq %cl, %rsi
; HYBRID-NEXT: notb %cl
; HYBRID-NEXT: shrq %rdx
+; HYBRID-NEXT: # kill: def $cl killed $cl killed $ecx
; HYBRID-NEXT: shrq %cl, %rdx
; HYBRID-NEXT: orq %rsi, %rdx
; HYBRID-NEXT: movq %rdx, 16(%rax)
@@ -89,27 +86,25 @@ define i256 @test1(i256 %a) nounwind {
; BURR-LABEL: test1:
; BURR: # %bb.0:
; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: xorps %xmm0, %xmm0
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: addl %esi, %esi
-; BURR-NEXT: addb $3, %sil
-; BURR-NEXT: movl %esi, %ecx
-; BURR-NEXT: andb $7, %cl
-; BURR-NEXT: shrb $3, %sil
-; BURR-NEXT: negb %sil
-; BURR-NEXT: movsbq %sil, %rdx
-; BURR-NEXT: movq -16(%rsp,%rdx), %rsi
-; BURR-NEXT: movq -8(%rsp,%rdx), %rdi
+; BURR-NEXT: leal (%rsi,%rsi), %ecx
+; BURR-NEXT: addb $3, %cl
+; BURR-NEXT: movl %ecx, %edx
+; BURR-NEXT: shrb $3, %dl
+; BURR-NEXT: andb $24, %dl
+; BURR-NEXT: negb %dl
+; BURR-NEXT: movsbq %dl, %rdx
+; BURR-NEXT: movq -24(%rsp,%rdx), %rsi
+; BURR-NEXT: movq -16(%rsp,%rdx), %rdi
; BURR-NEXT: shldq %cl, %rsi, %rdi
; BURR-NEXT: movq %rdi, 24(%rax)
-; BURR-NEXT: movq -32(%rsp,%rdx), %rdi
-; BURR-NEXT: movq -24(%rsp,%rdx), %rdx
+; BURR-NEXT: movq -40(%rsp,%rdx), %rdi
+; BURR-NEXT: movq -32(%rsp,%rdx), %rdx
; BURR-NEXT: movq %rdx, %r8
; BURR-NEXT: shldq %cl, %rdi, %r8
; BURR-NEXT: movq %r8, 8(%rax)
@@ -118,6 +113,7 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: shlq %cl, %rsi
; BURR-NEXT: notb %cl
; BURR-NEXT: shrq %rdx
+; BURR-NEXT: # kill: def $cl killed $cl killed $ecx
; BURR-NEXT: shrq %cl, %rdx
; BURR-NEXT: orq %rsi, %rdx
; BURR-NEXT: movq %rdx, 16(%rax)
@@ -126,33 +122,31 @@ define i256 @test1(i256 %a) nounwind {
; SRC-LABEL: test1:
; SRC: # %bb.0:
; SRC-NEXT: movq %rdi, %rax
-; SRC-NEXT: addl %esi, %esi
-; SRC-NEXT: addb $3, %sil
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: leal (%rsi,%rsi), %edx
+; SRC-NEXT: addb $3, %dl
+; SRC-NEXT: xorps %xmm0, %xmm0
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movl %esi, %edx
-; SRC-NEXT: andb $7, %dl
-; SRC-NEXT: shrb $3, %sil
-; SRC-NEXT: negb %sil
-; SRC-NEXT: movsbq %sil, %rsi
-; SRC-NEXT: movq -16(%rsp,%rsi), %rdi
+; SRC-NEXT: movl %edx, %ecx
+; SRC-NEXT: shrb $3, %cl
+; SRC-NEXT: andb $24, %cl
+; SRC-NEXT: negb %cl
+; SRC-NEXT: movsbq %cl, %rsi
+; SRC-NEXT: movq -24(%rsp,%rsi), %rdi
; SRC-NEXT: movq %rdi, %r8
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shlq %cl, %r8
; SRC-NEXT: notb %cl
-; SRC-NEXT: movq -32(%rsp,%rsi), %r9
-; SRC-NEXT: movq -24(%rsp,%rsi), %r10
+; SRC-NEXT: movq -40(%rsp,%rsi), %r9
+; SRC-NEXT: movq -32(%rsp,%rsi), %r10
; SRC-NEXT: movq %r10, %r11
; SRC-NEXT: shrq %r11
; SRC-NEXT: shrq %cl, %r11
; SRC-NEXT: orq %r8, %r11
-; SRC-NEXT: movq -8(%rsp,%rsi), %rsi
+; SRC-NEXT: movq -16(%rsp,%rsi), %rsi
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shldq %cl, %rdi, %rsi
; SRC-NEXT: movq %r9, %rdi
@@ -171,27 +165,25 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: addb $3, %dl
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shrb $3, %cl
+; LIN-NEXT: andb $24, %cl
; LIN-NEXT: negb %cl
; LIN-NEXT: movsbq %cl, %rsi
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: xorps %xmm0, %xmm0
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; LIN-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq -32(%rsp,%rsi), %rdi
-; LIN-NEXT: andb $7, %dl
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: movq -40(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r8
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shlq %cl, %r8
; LIN-NEXT: movq %r8, (%rax)
-; LIN-NEXT: movq -24(%rsp,%rsi), %r8
+; LIN-NEXT: movq -32(%rsp,%rsi), %r8
; LIN-NEXT: movq %r8, %r9
; LIN-NEXT: shldq %cl, %rdi, %r9
; LIN-NEXT: movq %r9, 8(%rax)
-; LIN-NEXT: movq -16(%rsp,%rsi), %rdi
+; LIN-NEXT: movq -24(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r9
; LIN-NEXT: shlq %cl, %r9
; LIN-NEXT: shrq %r8
@@ -199,7 +191,7 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: shrq %cl, %r8
; LIN-NEXT: orq %r9, %r8
; LIN-NEXT: movq %r8, 16(%rax)
-; LIN-NEXT: movq -8(%rsp,%rsi), %rsi
+; LIN-NEXT: movq -16(%rsp,%rsi), %rsi
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shldq %cl, %rdi, %rsi
; LIN-NEXT: movq %rsi, 24(%rax)
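
The scheduler-backtracking hunks show the second payoff of the aligned slot: with 16-byte alignment known, six of the eight scalar movq stores that zero-filled the i256 area collapse into three movaps stores of a zeroed xmm register, leaving only the movq $0 / movq $1 pair that seeds the non-zero word. A rough model of why an aligned buffer admits the wider stores (Slot256 and its alignment are illustrative stand-ins for the stack slot, not patch code):

  #include <cstring>

  struct alignas(16) Slot256 {
    unsigned long long W[8]; // value plus zero padding, 64 bytes
  };

  void zeroSlot(Slot256 &S) {
    // With the alignment visible to the compiler, this memset can be
    // emitted as movaps stores instead of a run of scalar movq stores.
    std::memset(&S, 0, sizeof S);
  }
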
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2f..767bd772ab7a3e 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -10,49 +10,45 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_lshr_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 8(%ebp), %eax
+; i686-NEXT: movl 12(%ebp), %edx
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, (%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
-; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: movzbl %al, %edi
+; i686-NEXT: movl 8(%esp,%edi), %eax
+; i686-NEXT: movl 4(%esp,%edi), %ebx
+; i686-NEXT: movl %ebx, %edx
+; i686-NEXT: shrdl %cl, %eax, %edx
+; i686-NEXT: movl (%esp,%edi), %esi
+; i686-NEXT: movl 12(%esp,%edi), %edi
+; i686-NEXT: shrdl %cl, %edi, %eax
+; i686-NEXT: shrdl %cl, %ebx, %esi
+; i686-NEXT: movl 40(%ebp), %ebx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shrl %cl, %edi
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %eax, 8(%ebx)
+; i686-NEXT: movl %edx, 4(%ebx)
+; i686-NEXT: movl %esi, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -81,50 +77,46 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_ashr_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 8(%ebp), %eax
+; i686-NEXT: movl 12(%ebp), %edx
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, (%esp)
-; i686-NEXT: sarl $31, %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: sarl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
-; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: movzbl %al, %edi
+; i686-NEXT: movl 8(%esp,%edi), %eax
+; i686-NEXT: movl 4(%esp,%edi), %ebx
+; i686-NEXT: movl %ebx, %edx
+; i686-NEXT: shrdl %cl, %eax, %edx
+; i686-NEXT: movl (%esp,%edi), %esi
+; i686-NEXT: movl 12(%esp,%edi), %edi
+; i686-NEXT: shrdl %cl, %edi, %eax
+; i686-NEXT: shrdl %cl, %ebx, %esi
+; i686-NEXT: movl 40(%ebp), %ebx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: sarl %cl, %edi
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %eax, 8(%ebx)
+; i686-NEXT: movl %edx, 4(%ebx)
+; i686-NEXT: movl %esi, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -154,15 +146,17 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_shl_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 8(%ebp), %eax
+; i686-NEXT: movl 12(%ebp), %edx
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -172,36 +166,27 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, (%esp)
; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: negb %cl
-; i686-NEXT: movsbl %cl, %ebp
-; i686-NEXT: movl 24(%esp,%ebp), %ebx
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 20(%esp,%ebp), %edi
-; i686-NEXT: movl %edi, %esi
-; i686-NEXT: shrl %esi
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
-; i686-NEXT: movl 16(%esp,%ebp), %edx
-; i686-NEXT: movl 28(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shldl %cl, %ebx, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl %edx, %ebx
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %ebx
-; i686-NEXT: shldl %cl, %edx, %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: movl %ebx, (%eax)
-; i686-NEXT: movl %esi, 8(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: negb %al
+; i686-NEXT: movsbl %al, %edi
+; i686-NEXT: movl 20(%esp,%edi), %eax
+; i686-NEXT: movl 24(%esp,%edi), %ebx
+; i686-NEXT: movl %ebx, %esi
+; i686-NEXT: shldl %cl, %eax, %esi
+; i686-NEXT: movl 16(%esp,%edi), %edx
+; i686-NEXT: movl 28(%esp,%edi), %edi
+; i686-NEXT: shldl %cl, %ebx, %edi
+; i686-NEXT: movl 40(%ebp), %ebx
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %esi, 8(%ebx)
+; i686-NEXT: movl %edx, %esi
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shldl %cl, %edx, %eax
+; i686-NEXT: movl %eax, 4(%ebx)
+; i686-NEXT: movl %esi, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -264,104 +249,93 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-LABEL: test_lshr_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $112, %esp
+; i686-NEXT: movl 40(%ebp), %edx
+; i686-NEXT: movl 24(%ebp), %eax
+; i686-NEXT: movl 28(%ebp), %ecx
+; i686-NEXT: movl 32(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 16(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 12(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 36(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %esi, %ecx
-; i686-NEXT: andl $7, %ecx
+; i686-NEXT: movl %edx, %ebx
+; i686-NEXT: andl $31, %ebx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 40(%esp,%edx), %eax
+; i686-NEXT: movl 36(%esp,%edx), %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: shrdl %cl, %eax, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 32(%esp,%edx), %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl $3, %esi
-; i686-NEXT: andl $15, %esi
-; i686-NEXT: movl 40(%esp,%esi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 44(%esp,%esi), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
+; i686-NEXT: movl 44(%esp,%edx), %edx
; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 36(%esp,%esi), %eax
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: movl %ebx, %esi
+; i686-NEXT: shrdl %cl, %edx, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %ebx
-; i686-NEXT: andl $15, %ebx
-; i686-NEXT: movl 72(%esp,%ebx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 76(%esp,%ebx), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: leal (%eax,%eax), %edi
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %ebp, %edi
-; i686-NEXT: movl 48(%esp,%esi), %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT: movl 56(%ebp), %edx
+; i686-NEXT: movl %edx, %eax
+; i686-NEXT: andl $31, %eax
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 72(%esp,%edx), %ebx
+; i686-NEXT: movl 68(%esp,%edx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %ebx, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 64(%esp,%edx), %edi
+; i686-NEXT: movl 76(%esp,%edx), %edx
+; i686-NEXT: shrdl %cl, %edx, %ebx
+; i686-NEXT: movl %esi, %ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl 68(%esp,%ebx), %ecx
-; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT: movl 80(%esp,%ebx), %esi
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %esi, 28(%ecx)
-; i686-NEXT: movl %ebx, 24(%ecx)
-; i686-NEXT: movl (%esp), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 16(%ecx)
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %edi, 20(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: shrl %cl, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl %ebx, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %edi, 16(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -402,107 +376,96 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-LABEL: test_ashr_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $92, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: sarl $31, %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $112, %esp
+; i686-NEXT: movl 40(%ebp), %edx
+; i686-NEXT: movl 24(%ebp), %eax
+; i686-NEXT: movl 28(%ebp), %ecx
+; i686-NEXT: movl 32(%ebp), %esi
+; i686-NEXT: movl 16(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 12(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 20(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 36(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT: sarl $31, %eax
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, %ebx
-; i686-NEXT: andl $7, %ebx
-; i686-NEXT: shrl $3, %edi
-; i686-NEXT: andl $15, %edi
-; i686-NEXT: movl 32(%esp,%edi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shrl %cl, %eax
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, %eax
+; i686-NEXT: andl $31, %eax
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 40(%esp,%edx), %esi
+; i686-NEXT: movl 36(%esp,%edx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 32(%esp,%edx), %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 44(%esp,%edx), %edx
+; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %edx, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 56(%ebp), %edx
+; i686-NEXT: movl %edx, %ebx
+; i686-NEXT: andl $31, %ebx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 72(%esp,%edx), %esi
+; i686-NEXT: movl 68(%esp,%edx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 36(%esp,%edi), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebp, %eax
-; i686-NEXT: movl %ebp, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %eax
-; i686-NEXT: andl $15, %eax
-; i686-NEXT: movl 64(%esp,%eax), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 68(%esp,%eax), %esi
-; i686-NEXT: leal (%esi,%esi), %eax
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %eax
-; i686-NEXT: orl %ebp, %eax
-; i686-NEXT: movl 28(%esp,%edi), %ecx
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 64(%esp,%edx), %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 40(%esp,%edi), %edi
+; i686-NEXT: movl 76(%esp,%edx), %edx
; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: shrdl %cl, %edx, %esi
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT: movl 60(%esp,%ecx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 72(%esp,%ecx), %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
+; i686-NEXT: sarl %cl, (%esp) # 4-byte Folded Spill
; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: sarl %cl, %edi
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: sarl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 28(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 24(%ecx)
-; i686-NEXT: movl %ebx, 16(%ecx)
-; i686-NEXT: movl %edi, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %eax, 20(%ecx)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
-; i686-NEXT: addl $92, %esp
+; i686-NEXT: shrdl %cl, %eax, %edi
+; i686-NEXT: sarl %cl, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl %esi, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %edi, 16(%eax)
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -546,112 +509,106 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-LABEL: test_shl_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $128, %esp
+; i686-NEXT: movl 40(%ebp), %edi
+; i686-NEXT: movl 24(%ebp), %eax
+; i686-NEXT: movl 28(%ebp), %ecx
+; i686-NEXT: movl 32(%ebp), %edx
+; i686-NEXT: movl 20(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 12(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 36(%ebp), %esi
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebp, %ecx
-; i686-NEXT: shrl $3, %ebp
-; i686-NEXT: andl $15, %ebp
+; i686-NEXT: movl %edi, %ebx
+; i686-NEXT: shrl $3, %ebx
+; i686-NEXT: andl $12, %ebx
; i686-NEXT: leal {{[0-9]+}}(%esp), %eax
-; i686-NEXT: subl %ebp, %eax
+; i686-NEXT: subl %ebx, %eax
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl 8(%eax), %edx
-; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT: andl $7, %ecx
+; i686-NEXT: movl (%eax), %esi
+; i686-NEXT: movl 4(%eax), %edx
+; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 8(%eax), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %edi, %ecx
+; i686-NEXT: andl $31, %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: movl 4(%eax), %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %esi
-; i686-NEXT: notl %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl (%eax), %eax
+; i686-NEXT: shldl %cl, %edx, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %edx
+; i686-NEXT: movl 56(%ebp), %eax
+; i686-NEXT: movl %eax, %edx
; i686-NEXT: shrl $3, %edx
-; i686-NEXT: andl $15, %edx
-; i686-NEXT: leal {{[0-9]+}}(%esp), %esi
-; i686-NEXT: subl %edx, %esi
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; i686-NEXT: subl %edx, %ecx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: andl $7, %ebx
-; i686-NEXT: movl 8(%esi), %edi
+; i686-NEXT: movl (%ecx), %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl 4(%esi), %eax
+; i686-NEXT: movl 4(%ecx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 8(%ecx), %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: andl $31, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %eax
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: notl %ecx
+; i686-NEXT: movl %ecx, %eax
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: orl %edi, %eax
-; i686-NEXT: movl (%esi), %ecx
-; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl %esi, %edi
+; i686-NEXT: shldl %cl, %edi, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %esi, %eax
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: shll %cl, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: negl %ebp
-; i686-NEXT: movl 64(%esp,%ebp), %esi
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: movl (%esp), %edi # 4-byte Reload
-; i686-NEXT: shldl %cl, %edi, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
+; i686-NEXT: negl %ebx
+; i686-NEXT: movl 76(%esp,%ebx), %ebx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl %esi, %edi
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shldl %cl, %esi, %ebp
+; i686-NEXT: shldl %cl, %esi, %ebx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: movl %edi, %esi
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %edx
-; i686-NEXT: movl 96(%esp,%edx), %edx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shldl %cl, %ebx, %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %edx, 28(%ecx)
-; i686-NEXT: movl %ebp, 20(%ecx)
-; i686-NEXT: movl %edi, 16(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 4(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %eax, 24(%ecx)
+; i686-NEXT: movl 108(%esp,%edx), %edx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 8(%ecx)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: shldl %cl, %eax, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %esi, 16(%eax)
+; i686-NEXT: movl %ebx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index e1466aebf42258..128e2199fb56f6 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -8,98 +8,78 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-LABEL: shift1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $112, %esp
+; CHECK-NEXT: movl 40(%ebp), %ecx
+; CHECK-NEXT: movl 8(%ebp), %eax
+; CHECK-NEXT: movl 12(%ebp), %edx
+; CHECK-NEXT: movl 16(%ebp), %esi
+; CHECK-NEXT: movl 32(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 28(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 24(%ebp), %edi
; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 20(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 36(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT: sarl $31, %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: sarl $31, %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: andb $7, %al
-; CHECK-NEXT: shrb $3, %cl
-; CHECK-NEXT: movzbl %cl, %ebp
-; CHECK-NEXT: movl 32(%esp,%ebp), %esi
+; CHECK-NEXT: shrb $5, %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: movl 40(%esp,%eax,4), %edx
+; CHECK-NEXT: movl 36(%esp,%eax,4), %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %esi
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: notb %dl
-; CHECK-NEXT: movl 36(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 40(%esp,%ebp), %esi
+; CHECK-NEXT: shrdl %cl, %edx, %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %esi
-; CHECK-NEXT: movl 44(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl 44(%esp,%eax,4), %esi
+; CHECK-NEXT: shrdl %cl, %esi, %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 48(%esp,%eax,4), %ebx
+; CHECK-NEXT: shrdl %cl, %ebx, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 52(%esp,%eax,4), %esi
+; CHECK-NEXT: shrdl %cl, %esi, %ebx
+; CHECK-NEXT: movl 56(%esp,%eax,4), %edx
+; CHECK-NEXT: shrdl %cl, %edx, %esi
+; CHECK-NEXT: movl 32(%esp,%eax,4), %edi
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 48(%esp,%ebp), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: movl 52(%esp,%ebp), %edi
-; CHECK-NEXT: leal (%edi,%edi), %esi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: orl %ebx, %esi
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT: movl 28(%esp,%ebp), %edx
-; CHECK-NEXT: movl 56(%esp,%ebp), %ebx
-; CHECK-NEXT: shrdl %cl, %ebx, %edi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %ebp, %edx
-; CHECK-NEXT: sarl %cl, %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl %ebx, 28(%eax)
-; CHECK-NEXT: movl %edi, 24(%eax)
-; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 16(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 8(%eax)
-; CHECK-NEXT: movl %edx, (%eax)
-; CHECK-NEXT: movl %esi, 20(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 12(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 4(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: movl 60(%esp,%eax,4), %eax
+; CHECK-NEXT: shrdl %cl, %eax, %edx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: sarl %cl, %eax
+; CHECK-NEXT: movl 72(%ebp), %ecx
+; CHECK-NEXT: movl %eax, 28(%ecx)
+; CHECK-NEXT: movl %edx, 24(%ecx)
+; CHECK-NEXT: movl %esi, 20(%ecx)
+; CHECK-NEXT: movl %ebx, 16(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 12(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 8(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 4(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, (%ecx)
+; CHECK-NEXT: leal -12(%ebp), %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -120,42 +100,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movb %r8b, %dl
-; CHECK-X64-O0-NEXT: movb %dl, %cl
-; CHECK-X64-O0-NEXT: andb $7, %cl
+; CHECK-X64-O0-NEXT: movb %r8b, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-X64-O0-NEXT: shrb $3, %dl
+; CHECK-X64-O0-NEXT: movb %cl, %dl
+; CHECK-X64-O0-NEXT: shrb $6, %dl
; CHECK-X64-O0-NEXT: movzbl %dl, %edx
; CHECK-X64-O0-NEXT: movl %edx, %edi
-; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi), %rdx
-; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi), %r8
-; CHECK-X64-O0-NEXT: movq %r8, %r9
-; CHECK-X64-O0-NEXT: shrq %cl, %r9
-; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: notb %cl
-; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi), %rsi
-; CHECK-X64-O0-NEXT: movq %rsi, %r10
-; CHECK-X64-O0-NEXT: addq %r10, %r10
-; CHECK-X64-O0-NEXT: shlq %cl, %r10
+; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT: movq -72(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT: movq %r9, %rdx
+; CHECK-X64-O0-NEXT: shrdq %cl, %rsi, %rdx
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: orq %r10, %r9
-; CHECK-X64-O0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi), %rdi
+; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi,8), %rdi
; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: shrdq %cl, %r8, %rdx
+; CHECK-X64-O0-NEXT: shrdq %cl, %r9, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-X64-O0-NEXT: sarq %cl, %rdi
; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-X64-O0-NEXT: movq %rdi, 24(%rax)
; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax)
-; CHECK-X64-O0-NEXT: movq %rdx, (%rax)
-; CHECK-X64-O0-NEXT: movq %rcx, 8(%rax)
+; CHECK-X64-O0-NEXT: movq %rdx, 8(%rax)
+; CHECK-X64-O0-NEXT: movq %rcx, (%rax)
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift1:
; CHECK-X64-O2: # %bb.0: # %entry
-; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
@@ -165,29 +138,23 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movl %r8d, %eax
-; CHECK-X64-O2-NEXT: andb $7, %al
-; CHECK-X64-O2-NEXT: shrb $3, %r8b
-; CHECK-X64-O2-NEXT: movzbl %r8b, %edx
-; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx), %rsi
-; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT: movq %rdi, %r8
-; CHECK-X64-O2-NEXT: movl %eax, %ecx
-; CHECK-X64-O2-NEXT: shrq %cl, %r8
-; CHECK-X64-O2-NEXT: notb %cl
-; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx), %r10
-; CHECK-X64-O2-NEXT: leaq (%r10,%r10), %r11
-; CHECK-X64-O2-NEXT: shlq %cl, %r11
-; CHECK-X64-O2-NEXT: orq %r8, %r11
-; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rdx
-; CHECK-X64-O2-NEXT: movl %eax, %ecx
-; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %r10
-; CHECK-X64-O2-NEXT: shrdq %cl, %rdi, %rsi
+; CHECK-X64-O2-NEXT: movl %r8d, %ecx
+; CHECK-X64-O2-NEXT: shrb $6, %cl
+; CHECK-X64-O2-NEXT: movzbl %cl, %edx
+; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx,8), %rsi
+; CHECK-X64-O2-NEXT: movq -72(%rsp,%rdx,8), %rdi
+; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx,8), %r9
+; CHECK-X64-O2-NEXT: movq %r9, %r10
+; CHECK-X64-O2-NEXT: movl %r8d, %ecx
+; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r10
+; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx,8), %rdx
+; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rsi
+; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdi
; CHECK-X64-O2-NEXT: sarq %cl, %rdx
-; CHECK-X64-O2-NEXT: movq %rdx, 24(%r9)
-; CHECK-X64-O2-NEXT: movq %r10, 16(%r9)
-; CHECK-X64-O2-NEXT: movq %rsi, (%r9)
-; CHECK-X64-O2-NEXT: movq %r11, 8(%r9)
+; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT: movq %rsi, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
; CHECK-X64-O2-NEXT: retq
entry:
%0 = ashr i256 %x, %a
@@ -199,11 +166,13 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-LABEL: shift2:
; CHECK: # %bb.0:
; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $112, %esp
+; CHECK-NEXT: movl 12(%ebp), %ecx
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -220,68 +189,54 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movb %al, %ch
-; CHECK-NEXT: andb $7, %ch
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrb $3, %al
+; CHECK-NEXT: andb $28, %al
; CHECK-NEXT: negb %al
; CHECK-NEXT: movsbl %al, %eax
-; CHECK-NEXT: movl 68(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: notb %cl
-; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT: movl 64(%esp,%eax), %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebp
-; CHECK-NEXT: shrl %cl, %ebp
-; CHECK-NEXT: orl %edx, %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 76(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: movl 72(%esp,%eax), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: orl %edx, %ebx
-; CHECK-NEXT: movl 84(%esp,%eax), %esi
+; CHECK-NEXT: movl 68(%esp,%eax), %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl 80(%esp,%eax), %edi
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shrl %edx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %edx
-; CHECK-NEXT: orl %esi, %edx
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: movl 72(%esp,%eax), %edx
+; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: shldl %cl, %esi, %edi
-; CHECK-NEXT: movl 60(%esp,%eax), %ebp
-; CHECK-NEXT: movl 88(%esp,%eax), %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %eax, %esi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 76(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: shldl %cl, %edx, %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 80(%esp,%eax), %edx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shldl %cl, %esi, %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 84(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: shldl %cl, %edx, %ebx
+; CHECK-NEXT: movl 88(%esp,%eax), %edi
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: shldl %cl, %esi, %edx
+; CHECK-NEXT: movl 64(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 92(%esp,%eax), %esi
+; CHECK-NEXT: shldl %cl, %edi, %esi
+; CHECK-NEXT: movl 8(%ebp), %eax
; CHECK-NEXT: movl %esi, 28(%eax)
-; CHECK-NEXT: movl %edi, 20(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: movl %esi, 12(%eax)
-; CHECK-NEXT: movl %ebp, %esi
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %ebp, %edi
-; CHECK-NEXT: movl %edi, 4(%eax)
-; CHECK-NEXT: movl %esi, (%eax)
; CHECK-NEXT: movl %edx, 24(%eax)
-; CHECK-NEXT: movl %ebx, 16(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 8(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: movl %ebx, 20(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 16(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 12(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 8(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: shll %cl, %edx
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: shldl %cl, %edi, %esi
+; CHECK-NEXT: movl %esi, 4(%eax)
+; CHECK-NEXT: movl %edx, (%eax)
+; CHECK-NEXT: leal -12(%ebp), %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -299,77 +254,64 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movb %sil, %dl
-; CHECK-X64-O0-NEXT: movb %dl, %cl
-; CHECK-X64-O0-NEXT: andb $7, %cl
+; CHECK-X64-O0-NEXT: movb %sil, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-X64-O0-NEXT: movb %cl, %dl
; CHECK-X64-O0-NEXT: shrb $3, %dl
+; CHECK-X64-O0-NEXT: andb $24, %dl
; CHECK-X64-O0-NEXT: negb %dl
-; CHECK-X64-O0-NEXT: movsbq %dl, %rdx
-; CHECK-X64-O0-NEXT: movq -16(%rsp,%rdx), %rsi
-; CHECK-X64-O0-NEXT: movq %rsi, %r10
-; CHECK-X64-O0-NEXT: shlq %cl, %r10
+; CHECK-X64-O0-NEXT: movsbq %dl, %r8
+; CHECK-X64-O0-NEXT: movq -40(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT: movq -32(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT: movq -24(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT: movq %r10, %rsi
+; CHECK-X64-O0-NEXT: shldq %cl, %rdx, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: notb %cl
-; CHECK-X64-O0-NEXT: movq -32(%rsp,%rdx), %r9
-; CHECK-X64-O0-NEXT: movq -24(%rsp,%rdx), %r8
-; CHECK-X64-O0-NEXT: movq %r8, %r11
-; CHECK-X64-O0-NEXT: shrq %r11
-; CHECK-X64-O0-NEXT: shrq %cl, %r11
-; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: orq %r11, %r10
-; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT: movq -8(%rsp,%rdx), %rdx
-; CHECK-X64-O0-NEXT: shldq %cl, %rsi, %rdx
+; CHECK-X64-O0-NEXT: movq -16(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT: shldq %cl, %r10, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: movq %r9, %rsi
-; CHECK-X64-O0-NEXT: shlq %cl, %rsi
+; CHECK-X64-O0-NEXT: movq %r9, %r10
+; CHECK-X64-O0-NEXT: shlq %cl, %r10
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: shldq %cl, %r9, %r8
+; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-X64-O0-NEXT: shldq %cl, %r9, %rdx
; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-X64-O0-NEXT: movq %r8, 8(%rdi)
-; CHECK-X64-O0-NEXT: movq %rsi, (%rdi)
-; CHECK-X64-O0-NEXT: movq %rdx, 24(%rdi)
-; CHECK-X64-O0-NEXT: movq %rcx, 16(%rdi)
+; CHECK-X64-O0-NEXT: movq %r8, 24(%rdi)
+; CHECK-X64-O0-NEXT: movq %rsi, 16(%rdi)
+; CHECK-X64-O0-NEXT: movq %rdx, 8(%rdi)
+; CHECK-X64-O0-NEXT: movq %rcx, (%rdi)
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift2:
; CHECK-X64-O2: # %bb.0:
+; CHECK-X64-O2-NEXT: movq %rsi, %rcx
; CHECK-X64-O2-NEXT: movq %rdi, %rax
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: xorps %xmm0, %xmm0
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movl %esi, %edx
-; CHECK-X64-O2-NEXT: andb $7, %dl
-; CHECK-X64-O2-NEXT: shrb $3, %sil
-; CHECK-X64-O2-NEXT: negb %sil
-; CHECK-X64-O2-NEXT: movsbq %sil, %rsi
-; CHECK-X64-O2-NEXT: movq -16(%rsp,%rsi), %rdi
-; CHECK-X64-O2-NEXT: movq %rdi, %r8
-; CHECK-X64-O2-NEXT: movl %edx, %ecx
+; CHECK-X64-O2-NEXT: movl %ecx, %edx
+; CHECK-X64-O2-NEXT: shrb $3, %dl
+; CHECK-X64-O2-NEXT: andb $24, %dl
+; CHECK-X64-O2-NEXT: negb %dl
+; CHECK-X64-O2-NEXT: movsbq %dl, %rdx
+; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %rdi
+; CHECK-X64-O2-NEXT: movq -24(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT: movq %r8, %r9
+; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %r9
+; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %rdx
+; CHECK-X64-O2-NEXT: shldq %cl, %r8, %rdx
+; CHECK-X64-O2-NEXT: movq %rsi, %r8
; CHECK-X64-O2-NEXT: shlq %cl, %r8
-; CHECK-X64-O2-NEXT: notb %cl
-; CHECK-X64-O2-NEXT: movq -32(%rsp,%rsi), %r9
-; CHECK-X64-O2-NEXT: movq -24(%rsp,%rsi), %r10
-; CHECK-X64-O2-NEXT: movq %r10, %r11
-; CHECK-X64-O2-NEXT: shrq %r11
-; CHECK-X64-O2-NEXT: shrq %cl, %r11
-; CHECK-X64-O2-NEXT: orq %r8, %r11
-; CHECK-X64-O2-NEXT: movq -8(%rsp,%rsi), %rsi
-; CHECK-X64-O2-NEXT: movl %edx, %ecx
-; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %rsi
-; CHECK-X64-O2-NEXT: movq %r9, %rdi
-; CHECK-X64-O2-NEXT: shlq %cl, %rdi
-; CHECK-X64-O2-NEXT: shldq %cl, %r9, %r10
-; CHECK-X64-O2-NEXT: movq %rsi, 24(%rax)
-; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
-; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
-; CHECK-X64-O2-NEXT: movq %r11, 16(%rax)
+; CHECK-X64-O2-NEXT: # kill: def $cl killed $cl killed $rcx
+; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %rdi
+; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT: movq %r9, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %rdi, 8(%rax)
+; CHECK-X64-O2-NEXT: movq %r8, (%rax)
; CHECK-X64-O2-NEXT: retq
{
%b = shl i256 1, %c ; %c must not be a constant
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index e5affd86312efd..277525796824bd 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -646,7 +646,869 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: lshr_16bytes:
+; FALLBACK16-LABEL: lshr_16bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $60, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %edx
+; FALLBACK16-NEXT: movl 4(%ecx), %esi
+; FALLBACK16-NEXT: movl 8(%ecx), %edi
+; FALLBACK16-NEXT: movl 12(%ecx), %ecx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movb %ah, %al
+; FALLBACK16-NEXT: shlb $3, %al
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $12, %ah
+; FALLBACK16-NEXT: movzbl %ah, %ebp
+; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl %ebx, 12(%edx)
+; FALLBACK16-NEXT: movl %ebp, 8(%edx)
+; FALLBACK16-NEXT: movl %esi, (%edx)
+; FALLBACK16-NEXT: movl %edi, 4(%edx)
+; FALLBACK16-NEXT: addl $60, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_16bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $44, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT: movl (%edx), %esi
+; FALLBACK17-NEXT: movl 4(%edx), %edi
+; FALLBACK17-NEXT: movl 8(%edx), %ebx
+; FALLBACK17-NEXT: movl 12(%edx), %edx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, (%esp)
+; FALLBACK17-NEXT: andb $12, %ch
+; FALLBACK17-NEXT: movzbl %ch, %ebx
+; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi
+; FALLBACK17-NEXT: movl (%esp,%ebx), %edx
+; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp
+; FALLBACK17-NEXT: movl %ebp, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx
+; FALLBACK17-NEXT: shrl %cl, %ebx
+; FALLBACK17-NEXT: movl %esi, 8(%eax)
+; FALLBACK17-NEXT: movl %ebx, 12(%eax)
+; FALLBACK17-NEXT: movl %edx, (%eax)
+; FALLBACK17-NEXT: movl %edi, 4(%eax)
+; FALLBACK17-NEXT: addl $44, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_16bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $44, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl (%ecx), %edx
+; FALLBACK18-NEXT: movl 4(%ecx), %esi
+; FALLBACK18-NEXT: movl 8(%ecx), %edi
+; FALLBACK18-NEXT: movl 12(%ecx), %ecx
+; FALLBACK18-NEXT: movzbl (%eax), %ebx
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, (%esp)
+; FALLBACK18-NEXT: andb $12, %bl
+; FALLBACK18-NEXT: movzbl %bl, %esi
+; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi
+; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx
+; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %ebp, %ecx
+; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT: orl %ebp, %edi
+; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi
+; FALLBACK18-NEXT: shrxl %eax, %esi, %eax
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %edx
+; FALLBACK18-NEXT: orl %ebx, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT: movl %eax, 12(%esi)
+; FALLBACK18-NEXT: movl %edx, 8(%esi)
+; FALLBACK18-NEXT: movl %edi, (%esi)
+; FALLBACK18-NEXT: movl %ecx, 4(%esi)
+; FALLBACK18-NEXT: addl $44, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_16bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $44, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT: movl (%edx), %esi
+; FALLBACK19-NEXT: movl 4(%edx), %edi
+; FALLBACK19-NEXT: movl 8(%edx), %ebx
+; FALLBACK19-NEXT: movl 12(%edx), %edx
+; FALLBACK19-NEXT: movzbl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, (%esp)
+; FALLBACK19-NEXT: andb $12, %al
+; FALLBACK19-NEXT: movzbl %al, %eax
+; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx
+; FALLBACK19-NEXT: movl (%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %edi
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi
+; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl %ebx, 8(%ebp)
+; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, (%ebp)
+; FALLBACK19-NEXT: movl %edi, 4(%ebp)
+; FALLBACK19-NEXT: addl $44, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_16bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $60, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm1, %xmm1
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $12, %cl
+; FALLBACK20-NEXT: movzbl %cl, %edi
+; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %esi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl %edi, 12(%edx)
+; FALLBACK20-NEXT: movl %ebx, 4(%edx)
+; FALLBACK20-NEXT: movl %ebp, 8(%edx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, (%edx)
+; FALLBACK20-NEXT: addl $60, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_16bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $44, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movups (%edx), %xmm0
+; FALLBACK21-NEXT: movzbl (%ecx), %edx
+; FALLBACK21-NEXT: movl %edx, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm1, %xmm1
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, (%esp)
+; FALLBACK21-NEXT: andb $12, %dl
+; FALLBACK21-NEXT: movzbl %dl, %ebx
+; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK21-NEXT: movl %ebp, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK21-NEXT: movl %eax, %ebx
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK21-NEXT: movl %edi, 8(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: shrl %cl, %edx
+; FALLBACK21-NEXT: movl %edx, 12(%ebp)
+; FALLBACK21-NEXT: movl %esi, (%ebp)
+; FALLBACK21-NEXT: addl $44, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_16bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm1, %xmm1
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, (%esp)
+; FALLBACK22-NEXT: andb $12, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: notb %cl
+; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK22-NEXT: orl %ebx, %edx
+; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK22-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT: orl %ebx, %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl %eax, 12(%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %edi, 8(%esi)
+; FALLBACK22-NEXT: movl %edx, (%esi)
+; FALLBACK22-NEXT: addl $44, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_16bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $44, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movups (%edx), %xmm0
+; FALLBACK23-NEXT: movzbl (%ecx), %edx
+; FALLBACK23-NEXT: movl %edx, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm1, %xmm1
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, (%esp)
+; FALLBACK23-NEXT: andb $12, %dl
+; FALLBACK23-NEXT: movzbl %dl, %ebx
+; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %ebx
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK23-NEXT: movl %edi, 8(%ebp)
+; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK23-NEXT: movl %edx, 12(%ebp)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, (%ebp)
+; FALLBACK23-NEXT: addl $44, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_16bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $60, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $12, %cl
+; FALLBACK24-NEXT: movzbl %cl, %edi
+; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %esi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl %edi, 12(%edx)
+; FALLBACK24-NEXT: movl %ebx, 4(%edx)
+; FALLBACK24-NEXT: movl %ebp, 8(%edx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, (%edx)
+; FALLBACK24-NEXT: addl $60, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_16bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $44, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: vmovups (%edx), %xmm0
+; FALLBACK25-NEXT: movzbl (%ecx), %edx
+; FALLBACK25-NEXT: movl %edx, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK25-NEXT: andb $12, %dl
+; FALLBACK25-NEXT: movzbl %dl, %ebx
+; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK25-NEXT: movl %ebp, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK25-NEXT: movl %eax, %ebx
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK25-NEXT: movl %edi, 8(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: shrl %cl, %edx
+; FALLBACK25-NEXT: movl %edx, 12(%ebp)
+; FALLBACK25-NEXT: movl %esi, (%ebp)
+; FALLBACK25-NEXT: addl $44, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_16bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $44, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK26-NEXT: andb $12, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: notb %cl
+; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK26-NEXT: orl %ebx, %edx
+; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK26-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT: orl %ebx, %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT: orl %ebp, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl %eax, 12(%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %edi, 8(%esi)
+; FALLBACK26-NEXT: movl %edx, (%esi)
+; FALLBACK26-NEXT: addl $44, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_16bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $44, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: vmovups (%edx), %xmm0
+; FALLBACK27-NEXT: movzbl (%ecx), %edx
+; FALLBACK27-NEXT: movl %edx, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK27-NEXT: andb $12, %dl
+; FALLBACK27-NEXT: movzbl %dl, %ebx
+; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %ebx
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK27-NEXT: movl %edi, 8(%ebp)
+; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK27-NEXT: movl %edx, 12(%ebp)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, (%ebp)
+; FALLBACK27-NEXT: addl $44, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_16bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $60, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $12, %cl
+; FALLBACK28-NEXT: movzbl %cl, %edi
+; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %esi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl %edi, 12(%edx)
+; FALLBACK28-NEXT: movl %ebx, 4(%edx)
+; FALLBACK28-NEXT: movl %ebp, 8(%edx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, (%edx)
+; FALLBACK28-NEXT: addl $60, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_16bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $44, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: vmovups (%edx), %xmm0
+; FALLBACK29-NEXT: movzbl (%ecx), %edx
+; FALLBACK29-NEXT: movl %edx, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK29-NEXT: andb $12, %dl
+; FALLBACK29-NEXT: movzbl %dl, %ebx
+; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK29-NEXT: movl %ebp, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK29-NEXT: movl %eax, %ebx
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK29-NEXT: movl %edi, 8(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: shrl %cl, %edx
+; FALLBACK29-NEXT: movl %edx, 12(%ebp)
+; FALLBACK29-NEXT: movl %esi, (%ebp)
+; FALLBACK29-NEXT: addl $44, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_16bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $44, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK30-NEXT: andb $12, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: notb %cl
+; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK30-NEXT: orl %ebx, %edx
+; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK30-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT: orl %ebx, %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT: orl %ebp, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl %eax, 12(%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %edi, 8(%esi)
+; FALLBACK30-NEXT: movl %edx, (%esi)
+; FALLBACK30-NEXT: addl $44, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_16bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $44, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: vmovups (%edx), %xmm0
+; FALLBACK31-NEXT: movzbl (%ecx), %edx
+; FALLBACK31-NEXT: movl %edx, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK31-NEXT: andb $12, %dl
+; FALLBACK31-NEXT: movzbl %dl, %ebx
+; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %ebx
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK31-NEXT: movl %edi, 8(%ebp)
+; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK31-NEXT: movl %edx, 12(%ebp)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, (%ebp)
+; FALLBACK31-NEXT: addl $44, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %rdi, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %edi, %edi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X86-SSE2-LABEL: lshr_16bytes_dwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
@@ -660,19 +1522,17 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 8(%edx), %ebx
; X86-SSE2-NEXT: movl 12(%edx), %edx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, (%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $15, %ecx
-; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
+; X86-SSE2-NEXT: andl $3, %ecx
+; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx
+; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi
+; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi
+; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx
; X86-SSE2-NEXT: movl %ecx, 8(%eax)
; X86-SSE2-NEXT: movl %edi, 12(%eax)
; X86-SSE2-NEXT: movl %edx, (%eax)
@@ -683,46 +1543,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: lshr_16bytes:
+; X86-SSE42-LABEL: lshr_16bytes_dwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $32, %esp
+; X86-SSE42-NEXT: subl $44, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm1, %xmm1
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $15, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $32, %esp
+; X86-SSE42-NEXT: addl $44, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: lshr_16bytes:
+; X86-AVX-LABEL: lshr_16bytes_dwordOff:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: subl $44, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %xmm0
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm0, (%esp)
-; X86-AVX-NEXT: andl $15, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
+; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $32, %esp
+; X86-AVX-NEXT: addl $44, %esp
; X86-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i128 %dwordOff, 5
%res = lshr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
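(Editorial aside, not part of the test file: every FALLBACK/X86 prefix above verifies the
same computation, which is stated by the IR at the end of each function. A minimal C model
of lshr_16bytes follows for orientation. Assumptions: the function name is invented, memcpy
stands in for the align-1 loads and stores, only the low byte of the offset is modeled to
match the movzbl in the checks, the GCC/Clang __int128 extension is used, and an
out-of-range shift, which is poison in the IR, is modeled as zero.)

    #include <string.h>

    /* Sketch of lshr_16bytes: read a 16-byte value and a byte offset via
     * unaligned loads, scale the offset to bits (the IR's
     * `shl i128 %byteOff, 3`), and store the right-shifted result. */
    void lshr_16bytes_model(const void *src_ptr, const void *byteOff_ptr,
                            void *dst) {
      unsigned __int128 src;
      unsigned char byteOff;
      memcpy(&src, src_ptr, 16);
      memcpy(&byteOff, byteOff_ptr, 1);
      unsigned bitOff = (unsigned)byteOff * 8;
      unsigned __int128 res = bitOff < 128 ? src >> bitOff : 0;
      memcpy(dst, &res, 16);
    }

The _dwordOff variant above has the same shape with the offset scaled by 32
(`shl i128 %dwordOff, 5`) instead of 8, so its X86-SSE2 checks mask the offset with
`andl $3` and read the result directly out of the stack slot with scaled
`(%esp,%ecx,4)` addressing.
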
define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
@@ -800,7 +1661,877 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: shl_16bytes:
+; FALLBACK16-LABEL: shl_16bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $60, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %ebx
+; FALLBACK16-NEXT: movl 4(%ecx), %esi
+; FALLBACK16-NEXT: movl 8(%ecx), %edi
+; FALLBACK16-NEXT: movl 12(%ecx), %ecx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movb %ah, %dh
+; FALLBACK16-NEXT: shlb $3, %dh
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $12, %ah
+; FALLBACK16-NEXT: negb %ah
+; FALLBACK16-NEXT: movsbl %ah, %ebp
+; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movb %dh, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: shrl %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %edi, %esi
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 8(%eax)
+; FALLBACK16-NEXT: movl %ebp, 12(%eax)
+; FALLBACK16-NEXT: movl %ebx, 4(%eax)
+; FALLBACK16-NEXT: addl $60, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_16bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $32, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT: movl (%edx), %esi
+; FALLBACK17-NEXT: movl 4(%edx), %edi
+; FALLBACK17-NEXT: movl 8(%edx), %ebx
+; FALLBACK17-NEXT: movl 12(%edx), %edx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, (%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $12, %ch
+; FALLBACK17-NEXT: negb %ch
+; FALLBACK17-NEXT: movsbl %ch, %edi
+; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK17-NEXT: shldl %cl, %edi, %esi
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK17-NEXT: shll %cl, %ebx
+; FALLBACK17-NEXT: movl %esi, 8(%eax)
+; FALLBACK17-NEXT: movl %edx, 12(%eax)
+; FALLBACK17-NEXT: movl %ebx, (%eax)
+; FALLBACK17-NEXT: movl %edi, 4(%eax)
+; FALLBACK17-NEXT: addl $32, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_16bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $44, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl (%ecx), %edx
+; FALLBACK18-NEXT: movl 4(%ecx), %esi
+; FALLBACK18-NEXT: movl 8(%ecx), %edi
+; FALLBACK18-NEXT: movl 12(%ecx), %ecx
+; FALLBACK18-NEXT: movzbl (%eax), %eax
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: shlb $3, %bl
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, (%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $12, %al
+; FALLBACK18-NEXT: negb %al
+; FALLBACK18-NEXT: movsbl %al, %edx
+; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi
+; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: notb %al
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %eax, %edi, %edi
+; FALLBACK18-NEXT: orl %esi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi
+; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx
+; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx
+; FALLBACK18-NEXT: shrl %edx
+; FALLBACK18-NEXT: shrxl %eax, %edx, %edx
+; FALLBACK18-NEXT: orl %esi, %edx
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl %ebp, (%ecx)
+; FALLBACK18-NEXT: movl %eax, 8(%ecx)
+; FALLBACK18-NEXT: movl %edx, 12(%ecx)
+; FALLBACK18-NEXT: movl %edi, 4(%ecx)
+; FALLBACK18-NEXT: addl $44, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_16bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $44, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT: movl (%edx), %esi
+; FALLBACK19-NEXT: movl 4(%edx), %edi
+; FALLBACK19-NEXT: movl 8(%edx), %ebx
+; FALLBACK19-NEXT: movl 12(%edx), %edx
+; FALLBACK19-NEXT: movzbl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, (%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $12, %al
+; FALLBACK19-NEXT: negb %al
+; FALLBACK19-NEXT: movsbl %al, %eax
+; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx
+; FALLBACK19-NEXT: shldl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax
+; FALLBACK19-NEXT: shldl %cl, %eax, %esi
+; FALLBACK19-NEXT: shldl %cl, %edi, %eax
+; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx
+; FALLBACK19-NEXT: movl %esi, 8(%ebp)
+; FALLBACK19-NEXT: movl %edx, 12(%ebp)
+; FALLBACK19-NEXT: movl %ecx, (%ebp)
+; FALLBACK19-NEXT: movl %eax, 4(%ebp)
+; FALLBACK19-NEXT: addl $44, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_16bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $60, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm1, %xmm1
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $12, %cl
+; FALLBACK20-NEXT: negb %cl
+; FALLBACK20-NEXT: movsbl %cl, %edi
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebp, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: movl %eax, (%edx)
+; FALLBACK20-NEXT: movl %ebp, 4(%edx)
+; FALLBACK20-NEXT: movl %edi, 8(%edx)
+; FALLBACK20-NEXT: movl %esi, 12(%edx)
+; FALLBACK20-NEXT: addl $60, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_16bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $44, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movups (%edx), %xmm0
+; FALLBACK21-NEXT: movzbl (%ecx), %edx
+; FALLBACK21-NEXT: movl %edx, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm1, %xmm1
+; FALLBACK21-NEXT: movaps %xmm1, (%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $12, %dl
+; FALLBACK21-NEXT: negb %dl
+; FALLBACK21-NEXT: movsbl %dl, %edi
+; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK21-NEXT: shldl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %ebx, %ebp
+; FALLBACK21-NEXT: shll %cl, %ebp
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT: movl %edi, 4(%eax)
+; FALLBACK21-NEXT: movl %esi, 8(%eax)
+; FALLBACK21-NEXT: movl %edx, 12(%eax)
+; FALLBACK21-NEXT: movl %ebp, (%eax)
+; FALLBACK21-NEXT: addl $44, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_16bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm1, %xmm1
+; FALLBACK22-NEXT: movaps %xmm1, (%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $12, %cl
+; FALLBACK22-NEXT: negb %cl
+; FALLBACK22-NEXT: movsbl %cl, %ecx
+; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK22-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK22-NEXT: movl %eax, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %edx
+; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT: orl %esi, %edx
+; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl %esi, %ebp
+; FALLBACK22-NEXT: shrl %ebp
+; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %edi, %ebp
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl %eax, (%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %ebp, 8(%esi)
+; FALLBACK22-NEXT: movl %edx, 12(%esi)
+; FALLBACK22-NEXT: addl $44, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_16bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $44, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movups (%edx), %xmm0
+; FALLBACK23-NEXT: movzbl (%ecx), %edx
+; FALLBACK23-NEXT: movl %edx, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm1, %xmm1
+; FALLBACK23-NEXT: movaps %xmm1, (%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $12, %dl
+; FALLBACK23-NEXT: negb %dl
+; FALLBACK23-NEXT: movsbl %dl, %edi
+; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK23-NEXT: shldl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK23-NEXT: shldl %cl, %edi, %esi
+; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK23-NEXT: movl %edi, 4(%eax)
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl %edx, 12(%eax)
+; FALLBACK23-NEXT: movl %ebp, (%eax)
+; FALLBACK23-NEXT: addl $44, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_16bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $60, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $12, %cl
+; FALLBACK24-NEXT: negb %cl
+; FALLBACK24-NEXT: movsbl %cl, %edi
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebp, %edi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: movl %eax, (%edx)
+; FALLBACK24-NEXT: movl %ebp, 4(%edx)
+; FALLBACK24-NEXT: movl %edi, 8(%edx)
+; FALLBACK24-NEXT: movl %esi, 12(%edx)
+; FALLBACK24-NEXT: addl $60, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_16bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $44, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: vmovups (%edx), %xmm0
+; FALLBACK25-NEXT: movzbl (%ecx), %edx
+; FALLBACK25-NEXT: movl %edx, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $12, %dl
+; FALLBACK25-NEXT: negb %dl
+; FALLBACK25-NEXT: movsbl %dl, %edi
+; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK25-NEXT: shldl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %ebx, %ebp
+; FALLBACK25-NEXT: shll %cl, %ebp
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT: movl %edi, 4(%eax)
+; FALLBACK25-NEXT: movl %esi, 8(%eax)
+; FALLBACK25-NEXT: movl %edx, 12(%eax)
+; FALLBACK25-NEXT: movl %ebp, (%eax)
+; FALLBACK25-NEXT: addl $44, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_16bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $44, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $12, %cl
+; FALLBACK26-NEXT: negb %cl
+; FALLBACK26-NEXT: movsbl %cl, %ecx
+; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK26-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK26-NEXT: movl %eax, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %edx
+; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT: orl %esi, %edx
+; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl %esi, %ebp
+; FALLBACK26-NEXT: shrl %ebp
+; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %edi, %ebp
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl %eax, (%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %ebp, 8(%esi)
+; FALLBACK26-NEXT: movl %edx, 12(%esi)
+; FALLBACK26-NEXT: addl $44, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_16bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $44, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: vmovups (%edx), %xmm0
+; FALLBACK27-NEXT: movzbl (%ecx), %edx
+; FALLBACK27-NEXT: movl %edx, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $12, %dl
+; FALLBACK27-NEXT: negb %dl
+; FALLBACK27-NEXT: movsbl %dl, %edi
+; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK27-NEXT: shldl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK27-NEXT: shldl %cl, %edi, %esi
+; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK27-NEXT: movl %edi, 4(%eax)
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl %edx, 12(%eax)
+; FALLBACK27-NEXT: movl %ebp, (%eax)
+; FALLBACK27-NEXT: addl $44, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_16bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $60, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $12, %cl
+; FALLBACK28-NEXT: negb %cl
+; FALLBACK28-NEXT: movsbl %cl, %edi
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebp, %edi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: movl %eax, (%edx)
+; FALLBACK28-NEXT: movl %ebp, 4(%edx)
+; FALLBACK28-NEXT: movl %edi, 8(%edx)
+; FALLBACK28-NEXT: movl %esi, 12(%edx)
+; FALLBACK28-NEXT: addl $60, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_16bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $44, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: vmovups (%edx), %xmm0
+; FALLBACK29-NEXT: movzbl (%ecx), %edx
+; FALLBACK29-NEXT: movl %edx, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $12, %dl
+; FALLBACK29-NEXT: negb %dl
+; FALLBACK29-NEXT: movsbl %dl, %edi
+; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK29-NEXT: shldl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %ebx, %ebp
+; FALLBACK29-NEXT: shll %cl, %ebp
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT: movl %edi, 4(%eax)
+; FALLBACK29-NEXT: movl %esi, 8(%eax)
+; FALLBACK29-NEXT: movl %edx, 12(%eax)
+; FALLBACK29-NEXT: movl %ebp, (%eax)
+; FALLBACK29-NEXT: addl $44, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_16bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $44, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $12, %cl
+; FALLBACK30-NEXT: negb %cl
+; FALLBACK30-NEXT: movsbl %cl, %ecx
+; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK30-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK30-NEXT: movl %eax, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %edx
+; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT: orl %esi, %edx
+; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl %esi, %ebp
+; FALLBACK30-NEXT: shrl %ebp
+; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %edi, %ebp
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl %eax, (%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %ebp, 8(%esi)
+; FALLBACK30-NEXT: movl %edx, 12(%esi)
+; FALLBACK30-NEXT: addl $44, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_16bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $44, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: vmovups (%edx), %xmm0
+; FALLBACK31-NEXT: movzbl (%ecx), %edx
+; FALLBACK31-NEXT: movl %edx, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $12, %dl
+; FALLBACK31-NEXT: negb %dl
+; FALLBACK31-NEXT: movsbl %dl, %edi
+; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK31-NEXT: shldl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK31-NEXT: shldl %cl, %edi, %esi
+; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK31-NEXT: movl %edi, 4(%eax)
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl %edx, 12(%eax)
+; FALLBACK31-NEXT: movl %ebp, (%eax)
+; FALLBACK31-NEXT: addl $44, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: orq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %eax, %eax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, 8(%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrq %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rdi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X86-SSE2-LABEL: shl_16bytes_dwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
@@ -814,15 +2545,14 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 8(%edx), %ebx
; X86-SSE2-NEXT: movl 12(%edx), %edx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, (%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, (%esp)
-; X86-SSE2-NEXT: andb $15, %cl
+; X86-SSE2-NEXT: shlb $2, %cl
+; X86-SSE2-NEXT: andb $12, %cl
; X86-SSE2-NEXT: negb %cl
; X86-SSE2-NEXT: movsbl %cl, %ecx
; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx
@@ -839,50 +2569,53 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: shl_16bytes:
+; X86-SSE42-LABEL: shl_16bytes_dwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $32, %esp
+; X86-SSE42-NEXT: subl $44, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm1, %xmm1
-; X86-SSE42-NEXT: movups %xmm1, (%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andb $15, %cl
+; X86-SSE42-NEXT: movaps %xmm1, (%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shlb $2, %cl
+; X86-SSE42-NEXT: andb $12, %cl
; X86-SSE42-NEXT: negb %cl
; X86-SSE42-NEXT: movsbl %cl, %ecx
; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $32, %esp
+; X86-SSE42-NEXT: addl $44, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: shl_16bytes:
+; X86-AVX-LABEL: shl_16bytes_dwordOff:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: subl $44, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %xmm0
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, (%esp)
-; X86-AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andb $15, %cl
+; X86-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: shlb $2, %cl
+; X86-AVX-NEXT: andb $12, %cl
; X86-AVX-NEXT: negb %cl
; X86-AVX-NEXT: movsbl %cl, %ecx
; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $32, %esp
+; X86-AVX-NEXT: addl $44, %esp
; X86-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i128 %dwordOff, 5
%res = shl i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
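(Editorial aside: the shl tests mirror the lshr ones. A matching sketch, under the same
assumptions as the lshr model above -- invented name, low offset byte only, out-of-range
shift modeled as zero.)

    #include <string.h>

    /* Sketch of shl_16bytes: same shape as the lshr model, shifting left.
     * In the scalar checks this direction shows up as the byte offset being
     * negated (`negb` + `movsbl`) and used as a negative index into the
     * stack slot. */
    void shl_16bytes_model(const void *src_ptr, const void *byteOff_ptr,
                           void *dst) {
      unsigned __int128 src;
      unsigned char byteOff;
      memcpy(&src, src_ptr, 16);
      memcpy(&byteOff, byteOff_ptr, 1);
      unsigned bitOff = (unsigned)byteOff * 8;
      unsigned __int128 res = bitOff < 128 ? src << bitOff : 0;
      memcpy(dst, &res, 16);
    }
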
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
@@ -960,50 +2693,355 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: ashr_16bytes:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebx
-; X86-SSE2-NEXT: pushl %edi
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl (%edx), %esi
-; X86-SSE2-NEXT: movl 4(%edx), %edi
-; X86-SSE2-NEXT: movl 8(%edx), %ebx
-; X86-SSE2-NEXT: movl 12(%edx), %edx
-; X86-SSE2-NEXT: movzbl (%ecx), %ecx
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %esi, (%esp)
-; X86-SSE2-NEXT: sarl $31, %edx
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $15, %ecx
-; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
-; X86-SSE2-NEXT: movl %ecx, 8(%eax)
-; X86-SSE2-NEXT: movl %edi, 12(%eax)
-; X86-SSE2-NEXT: movl %edx, (%eax)
-; X86-SSE2-NEXT: movl %esi, 4(%eax)
-; X86-SSE2-NEXT: addl $32, %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %edi
-; X86-SSE2-NEXT: popl %ebx
-; X86-SSE2-NEXT: retl
+; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-NO-BMI2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-NEXT: movb %ah, %al
+; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: andb $12, %ah
+; X86-NO-SHLD-NO-BMI2-NEXT: movzbl %ah, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 24(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 16(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, (%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: retl
;
-; X86-SSE42-LABEL: ashr_16bytes:
-; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: pushl %ebx
-; X86-SSE42-NEXT: pushl %edi
-; X86-SSE42-NEXT: pushl %esi
-; X86-SSE42-NEXT: subl $32, %esp
+; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: andb $12, %ch
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl %ch, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%esp,%ebx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebp, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl %al, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: sarq $63, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X86-SSE2-LABEL: ashr_16bytes_dwordOff:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: movl (%edx), %esi
+; X86-SSE2-NEXT: movl 4(%edx), %edi
+; X86-SSE2-NEXT: movl 8(%edx), %ebx
+; X86-SSE2-NEXT: movl 12(%edx), %edx
+; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %esi, (%esp)
+; X86-SSE2-NEXT: sarl $31, %edx
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: andl $3, %ecx
+; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx
+; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi
+; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi
+; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-SSE2-NEXT: movl %edi, 12(%eax)
+; X86-SSE2-NEXT: movl %edx, (%eax)
+; X86-SSE2-NEXT: movl %esi, 4(%eax)
+; X86-SSE2-NEXT: addl $32, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: ashr_16bytes_dwordOff:
+; X86-SSE42: # %bb.0:
+; X86-SSE42-NEXT: pushl %ebx
+; X86-SSE42-NEXT: pushl %edi
+; X86-SSE42-NEXT: pushl %esi
+; X86-SSE42-NEXT: subl $32, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1021,8 +3059,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $15, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $32, %esp
; X86-SSE42-NEXT: popl %esi
@@ -1030,7 +3068,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_16bytes:
+; X86-AVX-LABEL: ashr_16bytes_dwordOff:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
@@ -1053,8 +3091,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $15, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $32, %esp
; X86-AVX-NEXT: popl %esi
@@ -1062,84 +3100,2562 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: popl %ebx
; X86-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i128 %dwordOff, 5
%res = ashr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}

define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_32bytes:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %rdi
-; X64-SSE2-NEXT: movzbl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $31, %esi
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rax, (%rdx)
-; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT: retq
+; FALLBACK0-LABEL: lshr_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
;
-; X64-SSE42-LABEL: lshr_32bytes:
-; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: movups (%rdi), %xmm0
-; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT: movzbl (%rsi), %eax
-; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $31, %eax
-; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: retq
+; FALLBACK1-LABEL: lshr_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
;
-; X64-AVX-LABEL: lshr_32bytes:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: movzbl (%rsi), %eax
-; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $31, %eax
-; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
+; FALLBACK2-LABEL: lshr_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: movzbl %cl, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: movzbl %al, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shrq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: leal (,%rcx,8), %eax
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: movzbl %cl, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: movzbl %al, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
;
-; X86-SSE2-LABEL: lshr_32bytes:
+; FALLBACK8-LABEL: lshr_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: movzbl %cl, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: movzbl %al, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shrq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: leal (,%rcx,8), %eax
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: movzbl %cl, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: movzbl %al, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: movzbl %cl, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: movzbl %al, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shrq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: leal (,%rcx,8), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: movzbl %cl, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: movzbl %al, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: lshr_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT: movl (%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ebp), %edi
+; FALLBACK16-NEXT: movl 16(%ebp), %ebx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movl 20(%ebp), %esi
+; FALLBACK16-NEXT: movl 24(%ebp), %ecx
+; FALLBACK16-NEXT: movl 28(%ebp), %ebp
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movb %ah, %dh
+; FALLBACK16-NEXT: shlb $3, %dh
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %ah
+; FALLBACK16-NEXT: movzbl %ah, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi
+; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax
+; FALLBACK16-NEXT: movl %eax, %ebx
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movb %dh, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movl %eax, %ebx
+; FALLBACK16-NEXT: addl %eax, %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %esi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %esi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi
+; FALLBACK16-NEXT: movl %edi, %ebx
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %eax
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 28(%eax)
+; FALLBACK16-NEXT: movl %esi, 24(%eax)
+; FALLBACK16-NEXT: movl %edi, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl %ebp, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl (%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ebp), %esi
+; FALLBACK17-NEXT: movl 12(%ebp), %edi
+; FALLBACK17-NEXT: movl 16(%ebp), %ebx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movl 20(%ebp), %edx
+; FALLBACK17-NEXT: movl 24(%ebp), %eax
+; FALLBACK17-NEXT: movl 28(%ebp), %ebp
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %ch
+; FALLBACK17-NEXT: movzbl %ch, %ebp
+; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 24(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT: shrl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK17-NEXT: movl %edi, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %esi, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %esi
+; FALLBACK18-NEXT: movl 12(%eax), %edi
+; FALLBACK18-NEXT: movl 16(%eax), %ebp
+; FALLBACK18-NEXT: movzbl (%ebx), %ebx
+; FALLBACK18-NEXT: movl 20(%eax), %edx
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl 28(%eax), %eax
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %bl
+; FALLBACK18-NEXT: movzbl %bl, %edi
+; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, %esi, %edx
+; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %ebx, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi
+; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK18-NEXT: movl %ecx, %eax
+; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %ebx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl %ecx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi
+; FALLBACK18-NEXT: orl %esi, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %esi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %ebx, 28(%eax)
+; FALLBACK18-NEXT: movl %edi, 24(%eax)
+; FALLBACK18-NEXT: movl %esi, 16(%eax)
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %esi
+; FALLBACK19-NEXT: movl 12(%ecx), %edi
+; FALLBACK19-NEXT: movl 16(%ecx), %ebp
+; FALLBACK19-NEXT: movzbl (%ebx), %ebx
+; FALLBACK19-NEXT: movl 20(%ecx), %edx
+; FALLBACK19-NEXT: movl 24(%ecx), %eax
+; FALLBACK19-NEXT: movl 28(%ecx), %ecx
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %bl
+; FALLBACK19-NEXT: movzbl %bl, %ebp
+; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %esi, %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl %edx, %esi
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK19-NEXT: movl %esi, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, (%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 4(%ebp)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm2, %xmm2
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %cl
+; FALLBACK20-NEXT: movzbl %cl, %ecx
+; FALLBACK20-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK20-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK20-NEXT: movl %ecx, %edi
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 28(%eax)
+; FALLBACK20-NEXT: movl %esi, 4(%eax)
+; FALLBACK20-NEXT: movl %edi, 24(%eax)
+; FALLBACK20-NEXT: movl %ebp, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $108, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm2, %xmm2
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: movzbl %al, %ebp
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl %edi, %esi
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %esi, 4(%ebp)
+; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: shrl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %edx, (%ebp)
+; FALLBACK21-NEXT: addl $108, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %edx
+; FALLBACK22-NEXT: shlb $3, %dl
+; FALLBACK22-NEXT: xorps %xmm2, %xmm2
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %edx, %eax
+; FALLBACK22-NEXT: notb %al
+; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT: movl %eax, %ebp
+; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK22-NEXT: movl %ebp, %ecx
+; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %ebx, %ebx
+; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK22-NEXT: orl %ebp, %ebx
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: movl %ecx, %edx
+; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT: orl %ebp, %edi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT: movl %eax, 28(%edx)
+; FALLBACK22-NEXT: movl %ecx, 4(%edx)
+; FALLBACK22-NEXT: movl %edi, 24(%edx)
+; FALLBACK22-NEXT: movl %ebx, 16(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 8(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, (%edx)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $108, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm2, %xmm2
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: movzbl %al, %ebx
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edi
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %ebx, 4(%eax)
+; FALLBACK23-NEXT: movl %ebp, 24(%eax)
+; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT: movl %ebx, 28(%eax)
+; FALLBACK23-NEXT: movl %esi, 16(%eax)
+; FALLBACK23-NEXT: movl %edi, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 12(%eax)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, (%eax)
+; FALLBACK23-NEXT: addl $108, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %cl
+; FALLBACK24-NEXT: movzbl %cl, %ecx
+; FALLBACK24-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK24-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK24-NEXT: movl %ecx, %edi
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 28(%eax)
+; FALLBACK24-NEXT: movl %esi, 4(%eax)
+; FALLBACK24-NEXT: movl %edi, 24(%eax)
+; FALLBACK24-NEXT: movl %ebp, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $108, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: movzbl %al, %ebp
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl %edi, %esi
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %esi, 4(%ebp)
+; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: shrl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %edx, (%ebp)
+; FALLBACK25-NEXT: addl $108, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %edx
+; FALLBACK26-NEXT: shlb $3, %dl
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %edx, %eax
+; FALLBACK26-NEXT: notb %al
+; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT: movl %eax, %ebp
+; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK26-NEXT: movl %ebp, %ecx
+; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %ebx, %ebx
+; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK26-NEXT: orl %ebp, %ebx
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: movl %ecx, %edx
+; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT: orl %ebp, %edi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT: movl %eax, 28(%edx)
+; FALLBACK26-NEXT: movl %ecx, 4(%edx)
+; FALLBACK26-NEXT: movl %edi, 24(%edx)
+; FALLBACK26-NEXT: movl %ebx, 16(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%edx)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $108, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: movzbl %al, %ebx
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edi
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %ebx, 4(%eax)
+; FALLBACK27-NEXT: movl %ebp, 24(%eax)
+; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT: movl %ebx, 28(%eax)
+; FALLBACK27-NEXT: movl %esi, 16(%eax)
+; FALLBACK27-NEXT: movl %edi, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 12(%eax)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, (%eax)
+; FALLBACK27-NEXT: addl $108, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %cl
+; FALLBACK28-NEXT: movzbl %cl, %ecx
+; FALLBACK28-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK28-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK28-NEXT: movl %ecx, %edi
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 28(%eax)
+; FALLBACK28-NEXT: movl %esi, 4(%eax)
+; FALLBACK28-NEXT: movl %edi, 24(%eax)
+; FALLBACK28-NEXT: movl %ebp, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $108, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: movzbl %al, %ebp
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl %edi, %esi
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %esi, 4(%ebp)
+; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: shrl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %edx, (%ebp)
+; FALLBACK29-NEXT: addl $108, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %edx
+; FALLBACK30-NEXT: shlb $3, %dl
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %edx, %eax
+; FALLBACK30-NEXT: notb %al
+; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT: movl %eax, %ebp
+; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK30-NEXT: movl %ebp, %ecx
+; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %ebx, %ebx
+; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK30-NEXT: orl %ebp, %ebx
+; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: movl %ecx, %edx
+; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT: orl %ebp, %edi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT: movl %eax, 28(%edx)
+; FALLBACK30-NEXT: movl %ecx, 4(%edx)
+; FALLBACK30-NEXT: movl %edi, 24(%edx)
+; FALLBACK30-NEXT: movl %ebx, 16(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%edx)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $108, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: movzbl %al, %ebx
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edi
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %ebx, 4(%eax)
+; FALLBACK31-NEXT: movl %ebp, 24(%eax)
+; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT: movl %ebx, 28(%eax)
+; FALLBACK31-NEXT: movl %esi, 16(%eax)
+; FALLBACK31-NEXT: movl %edi, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 12(%eax)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, (%eax)
+; FALLBACK31-NEXT: addl $108, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: movl %esi, %eax
+; FALLBACK0-NEXT: shlb $5, %al
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $6, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: movl %esi, %ecx
+; FALLBACK1-NEXT: shlb $5, %cl
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $6, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: movl %esi, %eax
+; FALLBACK2-NEXT: shlb $5, %al
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $6, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: movl %esi, %ecx
+; FALLBACK3-NEXT: shlb $5, %cl
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $6, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: movl %ecx, %eax
+; FALLBACK4-NEXT: shlb $5, %al
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $6, %cl
+; FALLBACK4-NEXT: movzbl %cl, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: movl %eax, %ecx
+; FALLBACK5-NEXT: shlb $5, %cl
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $6, %al
+; FALLBACK5-NEXT: movzbl %al, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: shrq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: movl %ecx, %eax
+; FALLBACK6-NEXT: shlb $5, %al
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $6, %cl
+; FALLBACK6-NEXT: movzbl %cl, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: movl %eax, %ecx
+; FALLBACK7-NEXT: shlb $5, %cl
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $6, %al
+; FALLBACK7-NEXT: movzbl %al, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: movl %ecx, %eax
+; FALLBACK8-NEXT: shlb $5, %al
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $6, %cl
+; FALLBACK8-NEXT: movzbl %cl, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: movl %eax, %ecx
+; FALLBACK9-NEXT: shlb $5, %cl
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $6, %al
+; FALLBACK9-NEXT: movzbl %al, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: shrq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: movl %ecx, %eax
+; FALLBACK10-NEXT: shlb $5, %al
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $6, %cl
+; FALLBACK10-NEXT: movzbl %cl, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: movl %eax, %ecx
+; FALLBACK11-NEXT: shlb $5, %cl
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $6, %al
+; FALLBACK11-NEXT: movzbl %al, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: movl %ecx, %eax
+; FALLBACK12-NEXT: shlb $5, %al
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $6, %cl
+; FALLBACK12-NEXT: movzbl %cl, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: movl %eax, %ecx
+; FALLBACK13-NEXT: shlb $5, %cl
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $6, %al
+; FALLBACK13-NEXT: movzbl %al, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: shrq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: movl %ecx, %eax
+; FALLBACK14-NEXT: shlb $5, %al
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $6, %cl
+; FALLBACK14-NEXT: movzbl %cl, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: movl %eax, %ecx
+; FALLBACK15-NEXT: shlb $5, %cl
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $6, %al
+; FALLBACK15-NEXT: movzbl %al, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; X86-SSE2-LABEL: lshr_32bytes_dwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
+; X86-SSE2-NEXT: subl $92, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 8(%eax), %esi
; X86-SSE2-NEXT: movl 12(%eax), %edi
; X86-SSE2-NEXT: movl 16(%eax), %ebx
@@ -1148,35 +5664,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 28(%eax), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzbl (%eax), %eax
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 24(%eax)
; X86-SSE2-NEXT: movl %edx, 28(%eax)
@@ -1186,18 +5697,18 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %esi, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
+; X86-SSE2-NEXT: addl $92, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: lshr_32bytes:
+; X86-SSE42-LABEL: lshr_32bytes_dwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $64, %esp
+; X86-SSE42-NEXT: subl $76, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1205,21 +5716,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 16(%edx), %xmm1
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm2, %xmm2
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $31, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $64, %esp
+; X86-SSE42-NEXT: addl $76, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: lshr_32bytes:
+; X86-AVX-LABEL: lshr_32bytes_dwordOff:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $64, %esp
+; X86-AVX-NEXT: subl $76, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1228,137 +5739,2812 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX-NEXT: andl $31, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: andl $7, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $64, %esp
+; X86-AVX-NEXT: addl $76, %esp
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 5
%res = lshr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
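
The dwordOff tests above exercise the new one-step path: the shift amount is a
known multiple of the 32-bit load unit, so the lowering degenerates to a
zero-padded store plus a single aligned re-load, with no residual bit shift.
A minimal C sketch of that pattern (reference only, not part of the patch;
assumes little-endian and uses hypothetical names), mirroring the `andl $7` /
`(%esp,%ecx,4)` addressing in the checks above:

#include <stdint.h>
#include <string.h>

/* Reference-only sketch: lshr of a 32-byte value by dwordOff*32 bits
   through a zero-padded stack buffer, little-endian. */
static void lshr_32bytes_dwordOff_ref(const uint8_t *src, uint8_t *dst,
                                      unsigned dwordOff) {
  uint32_t buf[16] = {0};          /* value in buf[0..7], zeros in buf[8..15] */
  memcpy(buf, src, 32);
  dwordOff &= 7;                   /* mirrors the `andl $7` mask in the asm */
  memcpy(dst, buf + dwordOff, 32); /* one aligned re-load, no follow-up shift */
}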
-define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_32bytes:
+
+define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: lshr_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
; X64-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andb $31, %sil
-; X64-SSE2-NEXT: negb %sil
-; X64-SSE2-NEXT: movsbq %sil, %rax
-; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rcx
-; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rdi
-; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax
-; X64-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-SSE2-NEXT: andl $3, %esi
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rcx, (%rdx)
-; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT: movq %rax, (%rdx)
+; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: shl_32bytes:
+; X64-SSE42-LABEL: lshr_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movzbl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andb $31, %al
-; X64-SSE42-NEXT: negb %al
-; X64-SSE42-NEXT: movsbq %al, %rax
-; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: andl $3, %eax
+; X64-SSE42-NEXT: movups -72(%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT: movups -56(%rsp,%rax,8), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: shl_32bytes:
+; X64-AVX-LABEL: lshr_32bytes_qwordOff:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: movzbl (%rsi), %eax
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andb $31, %al
-; X64-AVX-NEXT: negb %al
-; X64-AVX-NEXT: movsbq %al, %rax
-; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0
-; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1
+; X64-AVX-NEXT: andl $3, %eax
+; X64-AVX-NEXT: vmovups -72(%rsp,%rax,8), %xmm0
+; X64-AVX-NEXT: vmovups -56(%rsp,%rax,8), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
-; X86-SSE2-LABEL: shl_32bytes:
+; X86-SSE2-LABEL: lshr_32bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
+; X86-SSE2-NEXT: subl $92, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: movl (%edi), %ecx
+; X86-SSE2-NEXT: movl (%eax), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%edi), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%edi), %esi
-; X86-SSE2-NEXT: movl 12(%edi), %ebx
-; X86-SSE2-NEXT: movl 16(%edi), %ebp
+; X86-SSE2-NEXT: movl 8(%eax), %esi
+; X86-SSE2-NEXT: movl 12(%eax), %edi
+; X86-SSE2-NEXT: movl 16(%eax), %ebx
+; X86-SSE2-NEXT: movl 20(%eax), %ebp
+; X86-SSE2-NEXT: movl 24(%eax), %edx
+; X86-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzbl (%eax), %eax
-; X86-SSE2-NEXT: movl 20(%edi), %edx
-; X86-SSE2-NEXT: movl 24(%edi), %ecx
-; X86-SSE2-NEXT: movl 28(%edi), %edi
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andb $31, %al
-; X86-SSE2-NEXT: negb %al
-; X86-SSE2-NEXT: movsbl %al, %edx
-; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax
+; X86-SSE2-NEXT: andl $3, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-SSE2-NEXT: movl %edx, 28(%eax)
+; X86-SSE2-NEXT: movl %ebp, 16(%eax)
+; X86-SSE2-NEXT: movl %ebx, 20(%eax)
+; X86-SSE2-NEXT: movl %edi, 8(%eax)
+; X86-SSE2-NEXT: movl %esi, 12(%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, (%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-SSE2-NEXT: addl $92, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: lshr_32bytes_qwordOff:
+; X86-SSE42: # %bb.0:
+; X86-SSE42-NEXT: subl $76, %esp
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT: movups (%edx), %xmm0
+; X86-SSE42-NEXT: movups 16(%edx), %xmm1
+; X86-SSE42-NEXT: movzbl (%ecx), %ecx
+; X86-SSE42-NEXT: xorps %xmm2, %xmm2
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: addl $76, %esp
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: lshr_32bytes_qwordOff:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: subl $76, %esp
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: vmovups (%edx), %ymm0
+; X86-AVX-NEXT: movzbl (%ecx), %ecx
+; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm0, (%esp)
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX-NEXT: addl $76, %esp
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i256 %qwordOff, 6
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
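
The qwordOff variant is the same pattern at 64-bit granularity: the offset is
masked with `andl $3` and scaled by 8 in the re-load. A minimal sketch under
the same assumptions (the shl tests that follow reuse the buffer but index it
from the top via `negb`, since for a left shift the zeros sit below the value):

#include <stdint.h>
#include <string.h>

/* Reference-only sketch: lshr of a 32-byte value by qwordOff*64 bits. */
static void lshr_32bytes_qwordOff_ref(const uint8_t *src, uint8_t *dst,
                                      unsigned qwordOff) {
  uint64_t buf[8] = {0};           /* value in buf[0..3], zeros in buf[4..7] */
  memcpy(buf, src, 32);
  qwordOff &= 3;                   /* mirrors the `andl $3` mask */
  memcpy(dst, buf + qwordOff, 32);
}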
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: shl_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: negb %sil
+; FALLBACK0-NEXT: movsbq %sil, %r10
+; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
+; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq %r8, %r9
+; FALLBACK0-NEXT: shrq %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
+; FALLBACK0-NEXT: movq %r10, %rbx
+; FALLBACK0-NEXT: shrq %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: orq %r11, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: negb %sil
+; FALLBACK1-NEXT: movsbq %sil, %rax
+; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shldq %cl, %r8, %rax
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shlq %cl, %r8
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK1-NEXT: movq %r8, (%rdx)
+; FALLBACK1-NEXT: movq %rax, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: negb %sil
+; FALLBACK2-NEXT: movsbq %sil, %rsi
+; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8
+; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9
+; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: shrq %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: shrq %rsi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, (%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: negb %sil
+; FALLBACK3-NEXT: movsbq %sil, %rax
+; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shldq %cl, %r8, %rax
+; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %rax, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: negb %cl
+; FALLBACK4-NEXT: movsbq %cl, %r8
+; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rdi
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r9, %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT: movq %r8, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: movq %r9, %r10
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, (%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %r11, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: negb %al
+; FALLBACK5-NEXT: movsbq %al, %rax
+; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq %r8, %r9
+; FALLBACK5-NEXT: shlq %cl, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shldq %cl, %r8, %rax
+; FALLBACK5-NEXT: movq %rax, 8(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: leal (,%rcx,8), %eax
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: negb %cl
+; FALLBACK6-NEXT: movsbq %cl, %rcx
+; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: shrq %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r8, %rcx
+; FALLBACK6-NEXT: shrq %r9
+; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, (%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: negb %al
+; FALLBACK7-NEXT: movsbq %al, %rax
+; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r8, %rax
+; FALLBACK7-NEXT: movq %rax, 8(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: negb %cl
+; FALLBACK8-NEXT: movsbq %cl, %r8
+; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT: movq %r10, %rdi
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r9, %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT: movq %r8, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: movq %r9, %r10
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, (%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %r11, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: negb %al
+; FALLBACK9-NEXT: movsbq %al, %rax
+; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq %r8, %r9
+; FALLBACK9-NEXT: shlq %cl, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shldq %cl, %r8, %rax
+; FALLBACK9-NEXT: movq %rax, 8(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: leal (,%rcx,8), %eax
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: negb %cl
+; FALLBACK10-NEXT: movsbq %cl, %rcx
+; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: shrq %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r8, %rcx
+; FALLBACK10-NEXT: shrq %r9
+; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, (%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: negb %al
+; FALLBACK11-NEXT: movsbq %al, %rax
+; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r8, %rax
+; FALLBACK11-NEXT: movq %rax, 8(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: negb %cl
+; FALLBACK12-NEXT: movsbq %cl, %r8
+; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT: movq %r10, %rdi
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r9, %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT: movq %r8, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: movq %r9, %r10
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, (%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %r11, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: negb %al
+; FALLBACK13-NEXT: movsbq %al, %rax
+; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq %r8, %r9
+; FALLBACK13-NEXT: shlq %cl, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shldq %cl, %r8, %rax
+; FALLBACK13-NEXT: movq %rax, 8(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: leal (,%rcx,8), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: negb %cl
+; FALLBACK14-NEXT: movsbq %cl, %rcx
+; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: shrq %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r8, %rcx
+; FALLBACK14-NEXT: shrq %r9
+; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, (%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: negb %al
+; FALLBACK15-NEXT: movsbq %al, %rax
+; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r8, %rax
+; FALLBACK15-NEXT: movq %rax, 8(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: shl_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%ecx), %esi
+; FALLBACK16-NEXT: movl 12(%ecx), %edi
+; FALLBACK16-NEXT: movl 16(%ecx), %ebx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movl 20(%ecx), %ebp
+; FALLBACK16-NEXT: movl 24(%ecx), %edx
+; FALLBACK16-NEXT: movl 28(%ecx), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movb %ah, %ch
+; FALLBACK16-NEXT: shlb $3, %ch
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %ah
+; FALLBACK16-NEXT: negb %ah
+; FALLBACK16-NEXT: movsbl %ah, %ebx
+; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax
+; FALLBACK16-NEXT: movl %eax, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movb %ch, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movl %esi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %eax
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %eax
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: orl %edi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, %ebx
+; FALLBACK16-NEXT: shrl %ebx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: orl %eax, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %edi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 24(%eax)
+; FALLBACK16-NEXT: movl %ebx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl %ebp, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl (%eax), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %edx
+; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%eax), %esi
+; FALLBACK17-NEXT: movl 12(%eax), %edi
+; FALLBACK17-NEXT: movl 16(%eax), %ebx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movl 20(%eax), %ebp
+; FALLBACK17-NEXT: movl 24(%eax), %edx
+; FALLBACK17-NEXT: movl 28(%eax), %eax
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %ch
+; FALLBACK17-NEXT: negb %ch
+; FALLBACK17-NEXT: movsbl %ch, %eax
+; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl 60(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl %ebx, %esi
+; FALLBACK17-NEXT: shldl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi
+; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi
+; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp
+; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi
+; FALLBACK17-NEXT: shldl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl %edx, 24(%eax)
+; FALLBACK17-NEXT: movl %esi, 28(%eax)
+; FALLBACK17-NEXT: movl %edi, 16(%eax)
+; FALLBACK17-NEXT: movl %ebp, 20(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, 8(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, 12(%eax)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT: shll %cl, %ebx
+; FALLBACK17-NEXT: movl %ebx, (%eax)
+; FALLBACK17-NEXT: movl %edx, 4(%eax)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %esi
+; FALLBACK18-NEXT: movl 12(%eax), %edi
+; FALLBACK18-NEXT: movl 16(%eax), %ebp
+; FALLBACK18-NEXT: movzbl (%ebx), %ebx
+; FALLBACK18-NEXT: movl 20(%eax), %edx
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl 28(%eax), %eax
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, %edx
+; FALLBACK18-NEXT: shlb $3, %dl
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %bl
+; FALLBACK18-NEXT: negb %bl
+; FALLBACK18-NEXT: movsbl %bl, %esi
+; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, %eax, %edi
+; FALLBACK18-NEXT: movl %edx, %ecx
+; FALLBACK18-NEXT: notb %cl
+; FALLBACK18-NEXT: shrl %ebx
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx
+; FALLBACK18-NEXT: orl %edi, %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl %ebx, %edi
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ebx
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp
+; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT: orl %ebp, %esi
+; FALLBACK18-NEXT: shrl %ebx
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx
+; FALLBACK18-NEXT: orl %eax, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl %edx, 24(%eax)
+; FALLBACK18-NEXT: movl %esi, 28(%eax)
+; FALLBACK18-NEXT: movl %edi, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %esi
+; FALLBACK19-NEXT: movl 12(%ecx), %edi
+; FALLBACK19-NEXT: movl 16(%ecx), %ebp
+; FALLBACK19-NEXT: movzbl (%ebx), %ebx
+; FALLBACK19-NEXT: movl 20(%ecx), %edx
+; FALLBACK19-NEXT: movl 24(%ecx), %eax
+; FALLBACK19-NEXT: movl 28(%ecx), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %bl
+; FALLBACK19-NEXT: negb %bl
+; FALLBACK19-NEXT: movsbl %bl, %eax
+; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx
+; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi
+; FALLBACK19-NEXT: shldl %cl, %edx, %esi
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl %edx, 24(%eax)
+; FALLBACK19-NEXT: movl %esi, 28(%eax)
+; FALLBACK19-NEXT: movl %edi, 16(%eax)
+; FALLBACK19-NEXT: movl %ebp, 20(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, 8(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, 12(%eax)
+; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload
+; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, (%eax)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT: movl %ebx, 4(%eax)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movb %cl, %dh
+; FALLBACK20-NEXT: shlb $3, %dh
+; FALLBACK20-NEXT: xorps %xmm2, %xmm2
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %cl
+; FALLBACK20-NEXT: negb %cl
+; FALLBACK20-NEXT: movsbl %cl, %eax
+; FALLBACK20-NEXT: movl 84(%esp,%eax), %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movb %dh, %dl
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: movl 80(%esp,%eax), %esi
+; FALLBACK20-NEXT: movl %eax, %ebx
+; FALLBACK20-NEXT: movl %esi, %eax
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %edi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp
+; FALLBACK20-NEXT: movl %ebp, %eax
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %esi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK20-NEXT: movl %ebx, %eax
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 64(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: shrl %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %edi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %edx, (%eax)
+; FALLBACK20-NEXT: movl %ebp, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl %ebx, 4(%eax)
+; FALLBACK20-NEXT: movl %esi, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $92, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm2, %xmm2
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: negb %al
+; FALLBACK21-NEXT: movsbl %al, %ebp
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK21-NEXT: shldl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK21-NEXT: shldl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, %eax
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %esi, %eax
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp
+; FALLBACK21-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movl %ebp, 28(%edx)
+; FALLBACK21-NEXT: movl %eax, 24(%edx)
+; FALLBACK21-NEXT: movl %esi, %eax
+; FALLBACK21-NEXT: shll %cl, %eax
+; FALLBACK21-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK21-NEXT: movl %ebx, 4(%edx)
+; FALLBACK21-NEXT: movl %edi, 8(%edx)
+; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 12(%edx)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 16(%edx)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 20(%edx)
+; FALLBACK21-NEXT: movl %eax, (%edx)
+; FALLBACK21-NEXT: addl $92, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm2, %xmm2
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: negb %cl
+; FALLBACK22-NEXT: movsbl %cl, %edx
+; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK22-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK22-NEXT: movl %eax, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx
+; FALLBACK22-NEXT: movl %ecx, %esi
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edi, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi
+; FALLBACK22-NEXT: movl %esi, %edi
+; FALLBACK22-NEXT: shrl %edi
+; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %ecx, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK22-NEXT: movl %esi, %edi
+; FALLBACK22-NEXT: shrl %edi
+; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ecx, %ebp
+; FALLBACK22-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi
+; FALLBACK22-NEXT: movl %esi, %ecx
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx
+; FALLBACK22-NEXT: shlxl %eax, %edx, %esi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: shrl %edx
+; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT: orl %edi, %edx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK22-NEXT: movl %edi, (%esi)
+; FALLBACK22-NEXT: movl %edx, 28(%esi)
+; FALLBACK22-NEXT: movl %eax, 24(%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %ebp, 8(%esi)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%esi)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 16(%esi)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%esi)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $92, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm2, %xmm2
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: negb %al
+; FALLBACK23-NEXT: movsbl %al, %ebx
+; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %edx
+; FALLBACK23-NEXT: shldl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK23-NEXT: shldl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, %eax
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %esi, %eax
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movl %ebx, 28(%edx)
+; FALLBACK23-NEXT: movl %eax, 24(%edx)
+; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK23-NEXT: movl %ebp, 4(%edx)
+; FALLBACK23-NEXT: movl %edi, 8(%edx)
+; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 12(%edx)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 16(%edx)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 20(%edx)
+; FALLBACK23-NEXT: movl %eax, (%edx)
+; FALLBACK23-NEXT: addl $92, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movb %cl, %dh
+; FALLBACK24-NEXT: shlb $3, %dh
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %cl
+; FALLBACK24-NEXT: negb %cl
+; FALLBACK24-NEXT: movsbl %cl, %eax
+; FALLBACK24-NEXT: movl 84(%esp,%eax), %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movb %dh, %dl
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: movl 80(%esp,%eax), %esi
+; FALLBACK24-NEXT: movl %eax, %ebx
+; FALLBACK24-NEXT: movl %esi, %eax
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %edi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %esi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK24-NEXT: movl %ebx, %eax
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: shrl %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %edi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %edx, (%eax)
+; FALLBACK24-NEXT: movl %ebp, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl %ebx, 4(%eax)
+; FALLBACK24-NEXT: movl %esi, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $92, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: negb %al
+; FALLBACK25-NEXT: movsbl %al, %ebp
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK25-NEXT: shldl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK25-NEXT: shldl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, %eax
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %esi, %eax
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp
+; FALLBACK25-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: movl %ebp, 28(%edx)
+; FALLBACK25-NEXT: movl %eax, 24(%edx)
+; FALLBACK25-NEXT: movl %esi, %eax
+; FALLBACK25-NEXT: shll %cl, %eax
+; FALLBACK25-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK25-NEXT: movl %ebx, 4(%edx)
+; FALLBACK25-NEXT: movl %edi, 8(%edx)
+; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 12(%edx)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 16(%edx)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 20(%edx)
+; FALLBACK25-NEXT: movl %eax, (%edx)
+; FALLBACK25-NEXT: addl $92, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: negb %cl
+; FALLBACK26-NEXT: movsbl %cl, %edx
+; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK26-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK26-NEXT: movl %eax, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx
+; FALLBACK26-NEXT: movl %ecx, %esi
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi
+; FALLBACK26-NEXT: movl %esi, %edi
+; FALLBACK26-NEXT: shrl %edi
+; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %ecx, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK26-NEXT: movl %esi, %edi
+; FALLBACK26-NEXT: shrl %edi
+; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ecx, %ebp
+; FALLBACK26-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi
+; FALLBACK26-NEXT: movl %esi, %ecx
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %edi, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx
+; FALLBACK26-NEXT: shlxl %eax, %edx, %esi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: shrl %edx
+; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT: orl %edi, %edx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK26-NEXT: movl %edi, (%esi)
+; FALLBACK26-NEXT: movl %edx, 28(%esi)
+; FALLBACK26-NEXT: movl %eax, 24(%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %ebp, 8(%esi)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%esi)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 16(%esi)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%esi)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $92, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: negb %al
+; FALLBACK27-NEXT: movsbl %al, %ebx
+; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx
+; FALLBACK27-NEXT: shldl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK27-NEXT: shldl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, %eax
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %esi, %eax
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: movl %ebx, 28(%edx)
+; FALLBACK27-NEXT: movl %eax, 24(%edx)
+; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT: movl %ebp, 4(%edx)
+; FALLBACK27-NEXT: movl %edi, 8(%edx)
+; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 12(%edx)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 16(%edx)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 20(%edx)
+; FALLBACK27-NEXT: movl %eax, (%edx)
+; FALLBACK27-NEXT: addl $92, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movb %cl, %dh
+; FALLBACK28-NEXT: shlb $3, %dh
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %cl
+; FALLBACK28-NEXT: negb %cl
+; FALLBACK28-NEXT: movsbl %cl, %eax
+; FALLBACK28-NEXT: movl 84(%esp,%eax), %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movb %dh, %dl
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: movl 80(%esp,%eax), %esi
+; FALLBACK28-NEXT: movl %eax, %ebx
+; FALLBACK28-NEXT: movl %esi, %eax
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %edi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %esi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK28-NEXT: movl %ebx, %eax
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: shrl %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %edi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %edx, (%eax)
+; FALLBACK28-NEXT: movl %ebp, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl %ebx, 4(%eax)
+; FALLBACK28-NEXT: movl %esi, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $92, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: negb %al
+; FALLBACK29-NEXT: movsbl %al, %ebp
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK29-NEXT: shldl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK29-NEXT: shldl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, %eax
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %esi, %eax
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp
+; FALLBACK29-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: movl %ebp, 28(%edx)
+; FALLBACK29-NEXT: movl %eax, 24(%edx)
+; FALLBACK29-NEXT: movl %esi, %eax
+; FALLBACK29-NEXT: shll %cl, %eax
+; FALLBACK29-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK29-NEXT: movl %ebx, 4(%edx)
+; FALLBACK29-NEXT: movl %edi, 8(%edx)
+; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 12(%edx)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 16(%edx)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 20(%edx)
+; FALLBACK29-NEXT: movl %eax, (%edx)
+; FALLBACK29-NEXT: addl $92, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: negb %cl
+; FALLBACK30-NEXT: movsbl %cl, %edx
+; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK30-NEXT: movl %eax, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx
+; FALLBACK30-NEXT: movl %ecx, %esi
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl %esi, %edi
+; FALLBACK30-NEXT: shrl %edi
+; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %ecx, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx
+; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl %esi, %edi
+; FALLBACK30-NEXT: shrl %edi
+; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ecx, %ebp
+; FALLBACK30-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl %esi, %ecx
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %edi, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx
+; FALLBACK30-NEXT: shlxl %eax, %edx, %esi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: shrl %edx
+; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT: orl %edi, %edx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK30-NEXT: movl %edi, (%esi)
+; FALLBACK30-NEXT: movl %edx, 28(%esi)
+; FALLBACK30-NEXT: movl %eax, 24(%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %ebp, 8(%esi)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%esi)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 16(%esi)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%esi)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $92, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: negb %al
+; FALLBACK31-NEXT: movsbl %al, %ebx
+; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %edx
+; FALLBACK31-NEXT: shldl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK31-NEXT: shldl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, %eax
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %esi, %eax
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: movl %ebx, 28(%edx)
+; FALLBACK31-NEXT: movl %eax, 24(%edx)
+; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT: movl %ebp, 4(%edx)
+; FALLBACK31-NEXT: movl %edi, 8(%edx)
+; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 12(%edx)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 16(%edx)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 20(%edx)
+; FALLBACK31-NEXT: movl %eax, (%edx)
+; FALLBACK31-NEXT: addl $92, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: shl_32bytes_dwordOff:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: movl %esi, %eax
+; FALLBACK0-NEXT: shlb $5, %al
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: shlb $2, %sil
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: negb %sil
+; FALLBACK0-NEXT: movsbq %sil, %r10
+; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
+; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq %r8, %r9
+; FALLBACK0-NEXT: shrq %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
+; FALLBACK0-NEXT: movq %r10, %rbx
+; FALLBACK0-NEXT: shrq %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: orq %r11, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_32bytes_dwordOff:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: movl %esi, %ecx
+; FALLBACK1-NEXT: shlb $5, %cl
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: shlb $2, %sil
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: negb %sil
+; FALLBACK1-NEXT: movsbq %sil, %rax
+; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shldq %cl, %r8, %rax
+; FALLBACK1-NEXT: shlq %cl, %r8
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK1-NEXT: movq %r8, (%rdx)
+; FALLBACK1-NEXT: movq %rax, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_32bytes_dwordOff:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: movl %esi, %eax
+; FALLBACK2-NEXT: shlb $5, %al
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: shlb $2, %sil
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: negb %sil
+; FALLBACK2-NEXT: movsbq %sil, %rsi
+; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8
+; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9
+; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: shrq %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: shrq %rsi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, (%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_32bytes_dwordOff:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: movl %esi, %ecx
+; FALLBACK3-NEXT: shlb $5, %cl
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: shlb $2, %sil
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: negb %sil
+; FALLBACK3-NEXT: movsbq %sil, %rax
+; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shldq %cl, %r8, %rax
+; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %rax, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_32bytes_dwordOff:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: movl %ecx, %eax
+; FALLBACK4-NEXT: shlb $5, %al
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: shlb $2, %cl
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: negb %cl
+; FALLBACK4-NEXT: movsbq %cl, %r8
+; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rdi
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r9, %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT: movq %r8, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: movq %r9, %r10
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, (%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %r11, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_32bytes_dwordOff:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: movl %eax, %ecx
+; FALLBACK5-NEXT: shlb $5, %cl
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: shlb $2, %al
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: negb %al
+; FALLBACK5-NEXT: movsbq %al, %rax
+; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq %r8, %r9
+; FALLBACK5-NEXT: shlq %cl, %r9
+; FALLBACK5-NEXT: shldq %cl, %r8, %rax
+; FALLBACK5-NEXT: movq %rax, 8(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_32bytes_dwordOff:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: movl %ecx, %eax
+; FALLBACK6-NEXT: shlb $5, %al
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: shlb $2, %cl
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: negb %cl
+; FALLBACK6-NEXT: movsbq %cl, %rcx
+; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: shrq %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r8, %rcx
+; FALLBACK6-NEXT: shrq %r9
+; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, (%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_32bytes_dwordOff:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: movl %eax, %ecx
+; FALLBACK7-NEXT: shlb $5, %cl
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: shlb $2, %al
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: negb %al
+; FALLBACK7-NEXT: movsbq %al, %rax
+; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r8, %rax
+; FALLBACK7-NEXT: movq %rax, 8(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_32bytes_dwordOff:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: movl %ecx, %eax
+; FALLBACK8-NEXT: shlb $5, %al
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: shlb $2, %cl
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: negb %cl
+; FALLBACK8-NEXT: movsbq %cl, %r8
+; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT: movq %r10, %rdi
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r9, %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT: movq %r8, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: movq %r9, %r10
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, (%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %r11, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_32bytes_dwordOff:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: movl %eax, %ecx
+; FALLBACK9-NEXT: shlb $5, %cl
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: shlb $2, %al
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: negb %al
+; FALLBACK9-NEXT: movsbq %al, %rax
+; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq %r8, %r9
+; FALLBACK9-NEXT: shlq %cl, %r9
+; FALLBACK9-NEXT: shldq %cl, %r8, %rax
+; FALLBACK9-NEXT: movq %rax, 8(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_32bytes_dwordOff:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: movl %ecx, %eax
+; FALLBACK10-NEXT: shlb $5, %al
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: shlb $2, %cl
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: negb %cl
+; FALLBACK10-NEXT: movsbq %cl, %rcx
+; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: shrq %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r8, %rcx
+; FALLBACK10-NEXT: shrq %r9
+; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, (%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_32bytes_dwordOff:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: movl %eax, %ecx
+; FALLBACK11-NEXT: shlb $5, %cl
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: shlb $2, %al
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: negb %al
+; FALLBACK11-NEXT: movsbq %al, %rax
+; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r8, %rax
+; FALLBACK11-NEXT: movq %rax, 8(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_32bytes_dwordOff:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: movl %ecx, %eax
+; FALLBACK12-NEXT: shlb $5, %al
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: shlb $2, %cl
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: negb %cl
+; FALLBACK12-NEXT: movsbq %cl, %r8
+; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT: movq %r10, %rdi
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r9, %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT: movq %r8, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: movq %r9, %r10
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, (%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %r11, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_32bytes_dwordOff:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: movl %eax, %ecx
+; FALLBACK13-NEXT: shlb $5, %cl
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: shlb $2, %al
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: negb %al
+; FALLBACK13-NEXT: movsbq %al, %rax
+; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq %r8, %r9
+; FALLBACK13-NEXT: shlq %cl, %r9
+; FALLBACK13-NEXT: shldq %cl, %r8, %rax
+; FALLBACK13-NEXT: movq %rax, 8(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_32bytes_dwordOff:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: movl %ecx, %eax
+; FALLBACK14-NEXT: shlb $5, %al
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: shlb $2, %cl
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: negb %cl
+; FALLBACK14-NEXT: movsbq %cl, %rcx
+; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: shrq %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r8, %rcx
+; FALLBACK14-NEXT: shrq %r9
+; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, (%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_32bytes_dwordOff:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: movl %eax, %ecx
+; FALLBACK15-NEXT: shlb $5, %cl
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: shlb $2, %al
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: negb %al
+; FALLBACK15-NEXT: movsbq %al, %rax
+; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r8, %rax
+; FALLBACK15-NEXT: movq %rax, 8(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; X86-SSE2-LABEL: shl_32bytes_dwordOff:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $92, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE2-NEXT: movl (%ebp), %eax
; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax
-; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi
-; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi
-; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx
-; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp
-; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx
-; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx
+; X86-SSE2-NEXT: movl 4(%ebp), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: movl 20(%ebp), %edx
+; X86-SSE2-NEXT: movl 24(%ebp), %eax
+; X86-SSE2-NEXT: movl 28(%ebp), %ebp
+; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: shlb $2, %cl
+; X86-SSE2-NEXT: andb $28, %cl
+; X86-SSE2-NEXT: negb %cl
+; X86-SSE2-NEXT: movsbl %cl, %edx
+; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi
+; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi
+; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx
+; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp
+; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %edx, 24(%eax)
; X86-SSE2-NEXT: movl %ecx, 28(%eax)
@@ -1368,18 +8554,205 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %esi, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
+; X86-SSE2-NEXT: addl $92, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: shl_32bytes:
+; X86-SSE42-LABEL: shl_32bytes_dwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $64, %esp
+; X86-SSE42-NEXT: subl $76, %esp
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT: movups (%edx), %xmm0
+; X86-SSE42-NEXT: movups 16(%edx), %xmm1
+; X86-SSE42-NEXT: movzbl (%ecx), %ecx
+; X86-SSE42-NEXT: xorps %xmm2, %xmm2
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, (%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shlb $2, %cl
+; X86-SSE42-NEXT: andb $28, %cl
+; X86-SSE42-NEXT: negb %cl
+; X86-SSE42-NEXT: movsbl %cl, %ecx
+; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0
+; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: addl $76, %esp
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: shl_32bytes_dwordOff:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: subl $76, %esp
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: vmovups (%edx), %ymm0
+; X86-AVX-NEXT: movzbl (%ecx), %ecx
+; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vmovups %ymm1, (%esp)
+; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: shlb $2, %cl
+; X86-AVX-NEXT: andb $28, %cl
+; X86-AVX-NEXT: negb %cl
+; X86-AVX-NEXT: movsbl %cl, %ecx
+; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
+; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX-NEXT: addl $76, %esp
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
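
For reference, a minimal C model (hypothetical, not part of the commit) of what the shl_32bytes_dwordOff CHECK lines above compute: the i256 left shift by %dwordOff * 32 bits is realized by staging the 32-byte value above a 32-byte zero pad on the stack and reloading at a negated, masked byte offset (the shlb $2 / andb $28 / negb / movsbl sequence in the X86-SSE42 and X86-AVX bodies). The helper name and buffer layout below are illustrative only.

#include <stdint.h>
#include <string.h>

/* Illustrative sketch: src and dst are 32-byte little-endian buffers.
   Mirrors the 32-bit dword-granular lowering above; the 64-bit variants
   additionally split the offset into a qword-aligned reload plus a
   32-bit funnel shift (the shld/shlx chains). */
static void shl_32bytes_dwordOff_model(const uint8_t *src, uint8_t dwordOff,
                                       uint8_t *dst) {
  uint8_t buf[64] = {0};                    /* low half: zero pad */
  memcpy(buf + 32, src, 32);                /* value above the pad */
  unsigned byteOff = (dwordOff * 4u) & 28;  /* asm: shlb $2; andb $28 */
  memcpy(dst, buf + 32 - byteOff, 32);      /* asm: negb + indexed reload */
}

Reading 32 bytes starting byteOff below the value pulls zeros into the low bytes, which in little-endian order is exactly a left shift by byteOff * 8 bits.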
+
+define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: shl_32bytes_qwordOff:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movq (%rdi), %rax
+; X64-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: shlb $3, %sil
+; X64-SSE2-NEXT: andb $24, %sil
+; X64-SSE2-NEXT: negb %sil
+; X64-SSE2-NEXT: movsbq %sil, %rax
+; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rcx
+; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rsi
+; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rax
+; X64-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT: movq %rcx, (%rdx)
+; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: shl_32bytes_qwordOff:
+; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: movups (%rdi), %xmm0
+; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE42-NEXT: movzbl (%rsi), %eax
+; X64-SSE42-NEXT: xorps %xmm2, %xmm2
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: shlb $3, %al
+; X64-SSE42-NEXT: andb $24, %al
+; X64-SSE42-NEXT: negb %al
+; X64-SSE42-NEXT: movsbq %al, %rax
+; X64-SSE42-NEXT: movups -40(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT: movups -24(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: shl_32bytes_qwordOff:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX-NEXT: movzbl (%rsi), %eax
+; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: shlb $3, %al
+; X64-AVX-NEXT: andb $24, %al
+; X64-AVX-NEXT: negb %al
+; X64-AVX-NEXT: movsbq %al, %rax
+; X64-AVX-NEXT: vmovups -40(%rsp,%rax), %xmm0
+; X64-AVX-NEXT: vmovups -24(%rsp,%rax), %xmm1
+; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
+; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+;
+; X86-SSE2-LABEL: shl_32bytes_qwordOff:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $92, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE2-NEXT: movl (%ebp), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%ebp), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: movl 20(%ebp), %edx
+; X86-SSE2-NEXT: movl 24(%ebp), %eax
+; X86-SSE2-NEXT: movl 28(%ebp), %ebp
+; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: shlb $3, %cl
+; X86-SSE2-NEXT: andb $24, %cl
+; X86-SSE2-NEXT: negb %cl
+; X86-SSE2-NEXT: movsbl %cl, %edx
+; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi
+; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi
+; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx
+; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp
+; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl %edx, 24(%eax)
+; X86-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-SSE2-NEXT: movl %ebp, 16(%eax)
+; X86-SSE2-NEXT: movl %ebx, 20(%eax)
+; X86-SSE2-NEXT: movl %edi, 8(%eax)
+; X86-SSE2-NEXT: movl %esi, 12(%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, (%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-SSE2-NEXT: addl $92, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: shl_32bytes_qwordOff:
+; X86-SSE42: # %bb.0:
+; X86-SSE42-NEXT: subl $76, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1387,50 +8760,3063 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 16(%edx), %xmm1
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm2, %xmm2
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, (%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andb $31, %cl
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, (%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shlb $3, %cl
+; X86-SSE42-NEXT: andb $24, %cl
; X86-SSE42-NEXT: negb %cl
; X86-SSE42-NEXT: movsbl %cl, %ecx
; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: addl $76, %esp
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: shl_32bytes_qwordOff:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: subl $76, %esp
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: vmovups (%edx), %ymm0
+; X86-AVX-NEXT: movzbl (%ecx), %ecx
+; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vmovups %ymm1, (%esp)
+; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: shlb $3, %cl
+; X86-AVX-NEXT: andb $24, %cl
+; X86-AVX-NEXT: negb %cl
+; X86-AVX-NEXT: movsbl %cl, %ecx
+; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
+; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX-NEXT: addl $76, %esp
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i256 %qwordOff, 6
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
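
The qword-offset test is the same staging trick at 8-byte granularity; note that the X64-SSE2 body above is pure movq/movaps traffic with no shift instructions, since the reload offset already absorbs the whole shift amount. A hypothetical C sketch of the same semantics, analogous to the dword model earlier:

#include <stdint.h>
#include <string.h>

/* Illustrative sketch: src and dst are 32-byte little-endian buffers. */
static void shl_32bytes_qwordOff_model(const uint8_t *src, uint8_t qwordOff,
                                       uint8_t *dst) {
  uint8_t buf[64] = {0};                    /* low half: zero pad */
  memcpy(buf + 32, src, 32);                /* value above the pad */
  unsigned byteOff = (qwordOff * 8u) & 24;  /* asm: shlb $3; andb $24 */
  memcpy(dst, buf + 32 - byteOff, 32);      /* asm: negb + indexed reload */
}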
+
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: ashr_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %rdi
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %rdi
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %rdi
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movq 16(%rdi), %rcx
+; FALLBACK4-NEXT: movq 24(%rdi), %rdi
+; FALLBACK4-NEXT: movzbl (%rsi), %esi
+; FALLBACK4-NEXT: leal (,%rsi,8), %eax
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rdi
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %sil
+; FALLBACK4-NEXT: movzbl %sil, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movq 16(%rdi), %rax
+; FALLBACK5-NEXT: movq 24(%rdi), %rdi
+; FALLBACK5-NEXT: movzbl (%rsi), %esi
+; FALLBACK5-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %sil
+; FALLBACK5-NEXT: movzbl %sil, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: sarq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movq 16(%rdi), %rcx
+; FALLBACK6-NEXT: movq 24(%rdi), %rdi
+; FALLBACK6-NEXT: movzbl (%rsi), %esi
+; FALLBACK6-NEXT: leal (,%rsi,8), %eax
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rdi
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %sil
+; FALLBACK6-NEXT: movzbl %sil, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movq 16(%rdi), %rax
+; FALLBACK7-NEXT: movq 24(%rdi), %rdi
+; FALLBACK7-NEXT: movzbl (%rsi), %esi
+; FALLBACK7-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %sil
+; FALLBACK7-NEXT: movzbl %sil, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT: movq 16(%rdi), %rcx
+; FALLBACK8-NEXT: movq 24(%rdi), %rdi
+; FALLBACK8-NEXT: movzbl (%rsi), %esi
+; FALLBACK8-NEXT: leal (,%rsi,8), %eax
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rdi
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %sil
+; FALLBACK8-NEXT: movzbl %sil, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT: movq 16(%rdi), %rax
+; FALLBACK9-NEXT: movq 24(%rdi), %rdi
+; FALLBACK9-NEXT: movzbl (%rsi), %esi
+; FALLBACK9-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %sil
+; FALLBACK9-NEXT: movzbl %sil, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: sarq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT: movq 16(%rdi), %rcx
+; FALLBACK10-NEXT: movq 24(%rdi), %rdi
+; FALLBACK10-NEXT: movzbl (%rsi), %esi
+; FALLBACK10-NEXT: leal (,%rsi,8), %eax
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rdi
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %sil
+; FALLBACK10-NEXT: movzbl %sil, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT: movq 16(%rdi), %rax
+; FALLBACK11-NEXT: movq 24(%rdi), %rdi
+; FALLBACK11-NEXT: movzbl (%rsi), %esi
+; FALLBACK11-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %sil
+; FALLBACK11-NEXT: movzbl %sil, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT: movq 16(%rdi), %rcx
+; FALLBACK12-NEXT: movq 24(%rdi), %rdi
+; FALLBACK12-NEXT: movzbl (%rsi), %esi
+; FALLBACK12-NEXT: leal (,%rsi,8), %eax
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rdi
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %sil
+; FALLBACK12-NEXT: movzbl %sil, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT: movq 16(%rdi), %rax
+; FALLBACK13-NEXT: movq 24(%rdi), %rdi
+; FALLBACK13-NEXT: movzbl (%rsi), %esi
+; FALLBACK13-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %sil
+; FALLBACK13-NEXT: movzbl %sil, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: sarq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT: movq 16(%rdi), %rcx
+; FALLBACK14-NEXT: movq 24(%rdi), %rdi
+; FALLBACK14-NEXT: movzbl (%rsi), %esi
+; FALLBACK14-NEXT: leal (,%rsi,8), %eax
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rdi
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %sil
+; FALLBACK14-NEXT: movzbl %sil, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT: movq 16(%rdi), %rax
+; FALLBACK15-NEXT: movq 24(%rdi), %rdi
+; FALLBACK15-NEXT: movzbl (%rsi), %esi
+; FALLBACK15-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %sil
+; FALLBACK15-NEXT: movzbl %sil, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: ashr_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK16-NEXT: movl (%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%esi), %ebx
+; FALLBACK16-NEXT: movl 12(%esi), %ebp
+; FALLBACK16-NEXT: movl 16(%esi), %edi
+; FALLBACK16-NEXT: movzbl (%eax), %ecx
+; FALLBACK16-NEXT: movl 20(%esi), %edx
+; FALLBACK16-NEXT: movl 24(%esi), %eax
+; FALLBACK16-NEXT: movl 28(%esi), %esi
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, %edx
+; FALLBACK16-NEXT: shlb $3, %dl
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: sarl $31, %esi
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %cl
+; FALLBACK16-NEXT: movzbl %cl, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi
+; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax
+; FALLBACK16-NEXT: movl %eax, %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movb %dl, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %eax, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl %edx, %ebx
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%edx,%edx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebx, %edx
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, %eax
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %eax, %edi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax
+; FALLBACK16-NEXT: leal (%eax,%eax), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK16-NEXT: sarl %cl, %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl %eax, 28(%ecx)
+; FALLBACK16-NEXT: movl %edx, 24(%ecx)
+; FALLBACK16-NEXT: movl %edi, 16(%ecx)
+; FALLBACK16-NEXT: movl %esi, 20(%ecx)
+; FALLBACK16-NEXT: movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 12(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, (%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 4(%ecx)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: ashr_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %ebp
+; FALLBACK17-NEXT: movl 16(%ecx), %ebx
+; FALLBACK17-NEXT: movzbl (%eax), %eax
+; FALLBACK17-NEXT: movl 20(%ecx), %edi
+; FALLBACK17-NEXT: movl 24(%ecx), %edx
+; FALLBACK17-NEXT: movl 28(%ecx), %esi
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, %ecx
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: sarl $31, %esi
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %al
+; FALLBACK17-NEXT: movzbl %al, %ebp
+; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT: sarl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK17-NEXT: movl %edi, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %esi, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: ashr_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT: movl (%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%esi), %ebx
+; FALLBACK18-NEXT: movl 12(%esi), %ebp
+; FALLBACK18-NEXT: movl 16(%esi), %edi
+; FALLBACK18-NEXT: movzbl (%ecx), %ecx
+; FALLBACK18-NEXT: movl 20(%esi), %edx
+; FALLBACK18-NEXT: movl 24(%esi), %eax
+; FALLBACK18-NEXT: movl 28(%esi), %esi
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: sarl $31, %esi
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %cl
+; FALLBACK18-NEXT: movzbl %cl, %edi
+; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp
+; FALLBACK18-NEXT: orl %ebx, %ebp
+; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT: orl %ebx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi
+; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx
+; FALLBACK18-NEXT: orl %ebx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl %ecx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi
+; FALLBACK18-NEXT: orl %esi, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %esi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edx
+; FALLBACK18-NEXT: orl %eax, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %ebx, 28(%eax)
+; FALLBACK18-NEXT: movl %edx, 24(%eax)
+; FALLBACK18-NEXT: movl %esi, 16(%eax)
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: ashr_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %edx
+; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ecx), %ebp
+; FALLBACK19-NEXT: movl 16(%ecx), %ebx
+; FALLBACK19-NEXT: movzbl (%eax), %eax
+; FALLBACK19-NEXT: movl 20(%ecx), %edi
+; FALLBACK19-NEXT: movl 24(%ecx), %edx
+; FALLBACK19-NEXT: movl 28(%ecx), %esi
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: sarl $31, %esi
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %al
+; FALLBACK19-NEXT: movzbl %al, %ebp
+; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %esi, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl %edx, %esi
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK19-NEXT: movl %esi, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 4(%ebp)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: ashr_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movl 16(%ecx), %esi
+; FALLBACK20-NEXT: movl 20(%ecx), %edi
+; FALLBACK20-NEXT: movl 24(%ecx), %ebx
+; FALLBACK20-NEXT: movl 28(%ecx), %edx
+; FALLBACK20-NEXT: movzbl (%eax), %eax
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shlb $3, %cl
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: sarl $31, %edx
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %al
+; FALLBACK20-NEXT: movzbl %al, %edi
+; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax
+; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl %ecx, %edx
+; FALLBACK20-NEXT: movb %cl, %dh
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %eax, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %eax, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %eax, %ebp
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: sarl %cl, %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movl %eax, 28(%ecx)
+; FALLBACK20-NEXT: movl %esi, 4(%ecx)
+; FALLBACK20-NEXT: movl %edi, 24(%ecx)
+; FALLBACK20-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 20(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 8(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 12(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, (%ecx)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: ashr_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $108, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movl 16(%ecx), %esi
+; FALLBACK21-NEXT: movl 20(%ecx), %edi
+; FALLBACK21-NEXT: movl 24(%ecx), %ebx
+; FALLBACK21-NEXT: movl 28(%ecx), %edx
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: sarl $31, %edx
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: movzbl %al, %ebp
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl %edi, %esi
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %esi, 4(%ebp)
+; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: sarl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %edx, (%ebp)
+; FALLBACK21-NEXT: addl $108, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: ashr_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movl 16(%ecx), %esi
+; FALLBACK22-NEXT: movl 20(%ecx), %edi
+; FALLBACK22-NEXT: movl 24(%ecx), %ebx
+; FALLBACK22-NEXT: movl 28(%ecx), %edx
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: sarl $31, %edx
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %eax, %edx
+; FALLBACK22-NEXT: notb %dl
+; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax
+; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %ebx, %ebx
+; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT: orl %ebp, %ebx
+; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT: sarxl %eax, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK22-NEXT: orl %ecx, %edi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT: movl %eax, 28(%edx)
+; FALLBACK22-NEXT: movl %ecx, 4(%edx)
+; FALLBACK22-NEXT: movl %edi, 24(%edx)
+; FALLBACK22-NEXT: movl %ebx, 16(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 8(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, (%edx)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: ashr_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $108, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movl 16(%ecx), %esi
+; FALLBACK23-NEXT: movl 20(%ecx), %edi
+; FALLBACK23-NEXT: movl 24(%ecx), %ebx
+; FALLBACK23-NEXT: movl 28(%ecx), %edx
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: sarl $31, %edx
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: movzbl %al, %ebx
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edi
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %ebx, 4(%eax)
+; FALLBACK23-NEXT: movl %ebp, 24(%eax)
+; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT: movl %ebx, 28(%eax)
+; FALLBACK23-NEXT: movl %esi, 16(%eax)
+; FALLBACK23-NEXT: movl %edi, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 12(%eax)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, (%eax)
+; FALLBACK23-NEXT: addl $108, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: ashr_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movl 16(%ecx), %esi
+; FALLBACK24-NEXT: movl 20(%ecx), %edi
+; FALLBACK24-NEXT: movl 24(%ecx), %ebx
+; FALLBACK24-NEXT: movl 28(%ecx), %edx
+; FALLBACK24-NEXT: movzbl (%eax), %eax
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shlb $3, %cl
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: sarl $31, %edx
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %al
+; FALLBACK24-NEXT: movzbl %al, %edi
+; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax
+; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl %ecx, %edx
+; FALLBACK24-NEXT: movb %cl, %dh
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %eax, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %eax, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %eax, %ebp
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: sarl %cl, %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: movl %eax, 28(%ecx)
+; FALLBACK24-NEXT: movl %esi, 4(%ecx)
+; FALLBACK24-NEXT: movl %edi, 24(%ecx)
+; FALLBACK24-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 20(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 8(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 12(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, (%ecx)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: ashr_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $108, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK25-NEXT: movl 16(%ecx), %esi
+; FALLBACK25-NEXT: movl 20(%ecx), %edi
+; FALLBACK25-NEXT: movl 24(%ecx), %ebx
+; FALLBACK25-NEXT: movl 28(%ecx), %edx
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: sarl $31, %edx
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: movzbl %al, %ebp
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl %edi, %esi
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %esi, 4(%ebp)
+; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: sarl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %edx, (%ebp)
+; FALLBACK25-NEXT: addl $108, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: ashr_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movl 16(%ecx), %esi
+; FALLBACK26-NEXT: movl 20(%ecx), %edi
+; FALLBACK26-NEXT: movl 24(%ecx), %ebx
+; FALLBACK26-NEXT: movl 28(%ecx), %edx
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: sarl $31, %edx
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %eax, %edx
+; FALLBACK26-NEXT: notb %dl
+; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax
+; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %ebx, %ebx
+; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT: orl %ebp, %ebx
+; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT: sarxl %eax, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK26-NEXT: orl %ecx, %edi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT: movl %eax, 28(%edx)
+; FALLBACK26-NEXT: movl %ecx, 4(%edx)
+; FALLBACK26-NEXT: movl %edi, 24(%edx)
+; FALLBACK26-NEXT: movl %ebx, 16(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%edx)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: ashr_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $108, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK27-NEXT: movl 16(%ecx), %esi
+; FALLBACK27-NEXT: movl 20(%ecx), %edi
+; FALLBACK27-NEXT: movl 24(%ecx), %ebx
+; FALLBACK27-NEXT: movl 28(%ecx), %edx
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: sarl $31, %edx
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: movzbl %al, %ebx
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edi
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %ebx, 4(%eax)
+; FALLBACK27-NEXT: movl %ebp, 24(%eax)
+; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT: movl %ebx, 28(%eax)
+; FALLBACK27-NEXT: movl %esi, 16(%eax)
+; FALLBACK27-NEXT: movl %edi, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 12(%eax)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, (%eax)
+; FALLBACK27-NEXT: addl $108, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: ashr_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movl 16(%ecx), %esi
+; FALLBACK28-NEXT: movl 20(%ecx), %edi
+; FALLBACK28-NEXT: movl 24(%ecx), %ebx
+; FALLBACK28-NEXT: movl 28(%ecx), %edx
+; FALLBACK28-NEXT: movzbl (%eax), %eax
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shlb $3, %cl
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: sarl $31, %edx
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %al
+; FALLBACK28-NEXT: movzbl %al, %edi
+; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax
+; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl %ecx, %edx
+; FALLBACK28-NEXT: movb %cl, %dh
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %eax, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %eax, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %eax, %ebp
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: sarl %cl, %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: movl %eax, 28(%ecx)
+; FALLBACK28-NEXT: movl %esi, 4(%ecx)
+; FALLBACK28-NEXT: movl %edi, 24(%ecx)
+; FALLBACK28-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 20(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 8(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 12(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, (%ecx)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: ashr_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $108, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK29-NEXT: movl 16(%ecx), %esi
+; FALLBACK29-NEXT: movl 20(%ecx), %edi
+; FALLBACK29-NEXT: movl 24(%ecx), %ebx
+; FALLBACK29-NEXT: movl 28(%ecx), %edx
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: sarl $31, %edx
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: movzbl %al, %ebp
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl %edi, %esi
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %esi, 4(%ebp)
+; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: sarl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %edx, (%ebp)
+; FALLBACK29-NEXT: addl $108, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: ashr_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movl 16(%ecx), %esi
+; FALLBACK30-NEXT: movl 20(%ecx), %edi
+; FALLBACK30-NEXT: movl 24(%ecx), %ebx
+; FALLBACK30-NEXT: movl 28(%ecx), %edx
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: sarl $31, %edx
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %eax, %edx
+; FALLBACK30-NEXT: notb %dl
+; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax
+; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %ebx, %ebx
+; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT: orl %ebp, %ebx
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT: sarxl %eax, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK30-NEXT: orl %ecx, %edi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT: movl %eax, 28(%edx)
+; FALLBACK30-NEXT: movl %ecx, 4(%edx)
+; FALLBACK30-NEXT: movl %edi, 24(%edx)
+; FALLBACK30-NEXT: movl %ebx, 16(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%edx)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: ashr_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $108, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK31-NEXT: movl 16(%ecx), %esi
+; FALLBACK31-NEXT: movl 20(%ecx), %edi
+; FALLBACK31-NEXT: movl 24(%ecx), %ebx
+; FALLBACK31-NEXT: movl 28(%ecx), %edx
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: sarl $31, %edx
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: movzbl %al, %ebx
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edi
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %ebx, 4(%eax)
+; FALLBACK31-NEXT: movl %ebp, 24(%eax)
+; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT: movl %ebx, 28(%eax)
+; FALLBACK31-NEXT: movl %esi, 16(%eax)
+; FALLBACK31-NEXT: movl %edi, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 12(%eax)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, (%eax)
+; FALLBACK31-NEXT: addl $108, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: movl %esi, %eax
+; FALLBACK0-NEXT: shlb $5, %al
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %rdi
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $6, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: movl %esi, %ecx
+; FALLBACK1-NEXT: shlb $5, %cl
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %rdi
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $6, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: movl %esi, %eax
+; FALLBACK2-NEXT: shlb $5, %al
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $6, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: movl %esi, %ecx
+; FALLBACK3-NEXT: shlb $5, %cl
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %rdi
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $6, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movq 16(%rdi), %rcx
+; FALLBACK4-NEXT: movq 24(%rdi), %rdi
+; FALLBACK4-NEXT: movzbl (%rsi), %esi
+; FALLBACK4-NEXT: movl %esi, %eax
+; FALLBACK4-NEXT: shlb $5, %al
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rdi
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $6, %sil
+; FALLBACK4-NEXT: movzbl %sil, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movq 16(%rdi), %rax
+; FALLBACK5-NEXT: movq 24(%rdi), %rdi
+; FALLBACK5-NEXT: movzbl (%rsi), %esi
+; FALLBACK5-NEXT: movl %esi, %ecx
+; FALLBACK5-NEXT: shlb $5, %cl
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $6, %sil
+; FALLBACK5-NEXT: movzbl %sil, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: sarq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movq 16(%rdi), %rcx
+; FALLBACK6-NEXT: movq 24(%rdi), %rdi
+; FALLBACK6-NEXT: movzbl (%rsi), %esi
+; FALLBACK6-NEXT: movl %esi, %eax
+; FALLBACK6-NEXT: shlb $5, %al
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rdi
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $6, %sil
+; FALLBACK6-NEXT: movzbl %sil, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movq 16(%rdi), %rax
+; FALLBACK7-NEXT: movq 24(%rdi), %rdi
+; FALLBACK7-NEXT: movzbl (%rsi), %esi
+; FALLBACK7-NEXT: movl %esi, %ecx
+; FALLBACK7-NEXT: shlb $5, %cl
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $6, %sil
+; FALLBACK7-NEXT: movzbl %sil, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT: movq 16(%rdi), %rcx
+; FALLBACK8-NEXT: movq 24(%rdi), %rdi
+; FALLBACK8-NEXT: movzbl (%rsi), %esi
+; FALLBACK8-NEXT: movl %esi, %eax
+; FALLBACK8-NEXT: shlb $5, %al
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rdi
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $6, %sil
+; FALLBACK8-NEXT: movzbl %sil, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT: movq 16(%rdi), %rax
+; FALLBACK9-NEXT: movq 24(%rdi), %rdi
+; FALLBACK9-NEXT: movzbl (%rsi), %esi
+; FALLBACK9-NEXT: movl %esi, %ecx
+; FALLBACK9-NEXT: shlb $5, %cl
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $6, %sil
+; FALLBACK9-NEXT: movzbl %sil, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: sarq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT: movq 16(%rdi), %rcx
+; FALLBACK10-NEXT: movq 24(%rdi), %rdi
+; FALLBACK10-NEXT: movzbl (%rsi), %esi
+; FALLBACK10-NEXT: movl %esi, %eax
+; FALLBACK10-NEXT: shlb $5, %al
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rdi
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $6, %sil
+; FALLBACK10-NEXT: movzbl %sil, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT: movq 16(%rdi), %rax
+; FALLBACK11-NEXT: movq 24(%rdi), %rdi
+; FALLBACK11-NEXT: movzbl (%rsi), %esi
+; FALLBACK11-NEXT: movl %esi, %ecx
+; FALLBACK11-NEXT: shlb $5, %cl
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $6, %sil
+; FALLBACK11-NEXT: movzbl %sil, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT: movq 16(%rdi), %rcx
+; FALLBACK12-NEXT: movq 24(%rdi), %rdi
+; FALLBACK12-NEXT: movzbl (%rsi), %esi
+; FALLBACK12-NEXT: movl %esi, %eax
+; FALLBACK12-NEXT: shlb $5, %al
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rdi
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $6, %sil
+; FALLBACK12-NEXT: movzbl %sil, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT: movq 16(%rdi), %rax
+; FALLBACK13-NEXT: movq 24(%rdi), %rdi
+; FALLBACK13-NEXT: movzbl (%rsi), %esi
+; FALLBACK13-NEXT: movl %esi, %ecx
+; FALLBACK13-NEXT: shlb $5, %cl
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $6, %sil
+; FALLBACK13-NEXT: movzbl %sil, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: sarq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT: movq 16(%rdi), %rcx
+; FALLBACK14-NEXT: movq 24(%rdi), %rdi
+; FALLBACK14-NEXT: movzbl (%rsi), %esi
+; FALLBACK14-NEXT: movl %esi, %eax
+; FALLBACK14-NEXT: shlb $5, %al
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rdi
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $6, %sil
+; FALLBACK14-NEXT: movzbl %sil, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT: movq 16(%rdi), %rax
+; FALLBACK15-NEXT: movq 24(%rdi), %rdi
+; FALLBACK15-NEXT: movzbl (%rsi), %esi
+; FALLBACK15-NEXT: movl %esi, %ecx
+; FALLBACK15-NEXT: shlb $5, %cl
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $6, %sil
+; FALLBACK15-NEXT: movzbl %sil, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: retq
+;
+; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $92, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl (%eax), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%eax), %edi
+; X86-SSE2-NEXT: movl 12(%eax), %ebx
+; X86-SSE2-NEXT: movl 16(%eax), %ebp
+; X86-SSE2-NEXT: movl 20(%eax), %esi
+; X86-SSE2-NEXT: movl 24(%eax), %edx
+; X86-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movzbl (%eax), %eax
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: sarl $31, %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-SSE2-NEXT: movl %edx, 28(%eax)
+; X86-SSE2-NEXT: movl %ebp, 16(%eax)
+; X86-SSE2-NEXT: movl %ebx, 20(%eax)
+; X86-SSE2-NEXT: movl %edi, 8(%eax)
+; X86-SSE2-NEXT: movl %esi, 12(%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, (%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-SSE2-NEXT: addl $92, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: ashr_32bytes_dwordOff:
+; X86-SSE42: # %bb.0:
+; X86-SSE42-NEXT: pushl %ebx
+; X86-SSE42-NEXT: pushl %edi
+; X86-SSE42-NEXT: pushl %esi
+; X86-SSE42-NEXT: subl $64, %esp
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT: movups (%edx), %xmm0
+; X86-SSE42-NEXT: movl 16(%edx), %esi
+; X86-SSE42-NEXT: movl 20(%edx), %edi
+; X86-SSE42-NEXT: movl 24(%edx), %ebx
+; X86-SSE42-NEXT: movl 28(%edx), %edx
+; X86-SSE42-NEXT: movzbl (%ecx), %ecx
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: sarl $31, %edx
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1
+; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $64, %esp
+; X86-SSE42-NEXT: popl %esi
+; X86-SSE42-NEXT: popl %edi
+; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: shl_32bytes:
+; X86-AVX-LABEL: ashr_32bytes_dwordOff:
; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebx
+; X86-AVX-NEXT: pushl %edi
+; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: subl $64, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %ymm0
+; X86-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-AVX-NEXT: movl 16(%edx), %esi
+; X86-AVX-NEXT: movl 20(%edx), %edi
+; X86-AVX-NEXT: movl 24(%edx), %ebx
+; X86-AVX-NEXT: movl 28(%edx), %edx
; X86-AVX-NEXT: movzbl (%ecx), %ecx
-; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %ymm1, (%esp)
-; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andb $31, %cl
-; X86-AVX-NEXT: negb %cl
-; X86-AVX-NEXT: movsbl %cl, %ecx
-; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT: sarl $31, %edx
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: andl $7, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $64, %esp
-; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: popl %edi
+; X86-AVX-NEXT: popl %ebx
; X86-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
- %res = shl i256 %src, %bitOff
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 5
+ %res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
-define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_32bytes:
+
+define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: ashr_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
@@ -1446,18 +11832,18 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $31, %esi
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT: andl $3, %esi
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rax, (%rdx)
; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: ashr_32bytes:
+; X64-SSE42-LABEL: ashr_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movq 16(%rdi), %rax
@@ -1465,20 +11851,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movzbl (%rsi), %esi
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: sarq $63, %rcx
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $31, %esi
-; X64-SSE42-NEXT: movups -64(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rsi), %xmm1
+; X64-SSE42-NEXT: andl $3, %esi
+; X64-SSE42-NEXT: movups -72(%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT: movups -56(%rsp,%rsi,8), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: ashr_32bytes:
+; X64-AVX-LABEL: ashr_32bytes_qwordOff:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-AVX-NEXT: movq 16(%rdi), %rax
@@ -1486,31 +11872,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX-NEXT: movzbl (%rsi), %esi
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: sarq $63, %rcx
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $31, %esi
-; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1
+; X64-AVX-NEXT: andl $3, %esi
+; X64-AVX-NEXT: vmovups -72(%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT: vmovups -56(%rsp,%rsi,8), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
; X64-AVX-NEXT: retq
;
-; X86-SSE2-LABEL: ashr_32bytes:
+; X86-SSE2-LABEL: ashr_32bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
+; X86-SSE2-NEXT: subl $92, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 8(%eax), %edi
; X86-SSE2-NEXT: movl 12(%eax), %ebx
; X86-SSE2-NEXT: movl 16(%eax), %ebp
@@ -1525,7 +11911,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -1538,17 +11924,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $3, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 24(%eax)
; X86-SSE2-NEXT: movl %edx, 28(%eax)
@@ -1558,16 +11944,16 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %esi, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
+; X86-SSE2-NEXT: addl $92, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: ashr_32bytes:
+; X86-SSE42-LABEL: ashr_32bytes_qwordOff:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: pushl %ebx
; X86-SSE42-NEXT: pushl %edi
@@ -1586,7 +11972,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
; X86-SSE42-NEXT: sarl $31, %edx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -1596,9 +11982,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $31, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $64, %esp
@@ -1607,7 +11993,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_32bytes:
+; X86-AVX-LABEL: ashr_32bytes_qwordOff:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
@@ -1626,7 +12012,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, (%esp)
; X86-AVX-NEXT: sarl $31, %edx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -1636,9 +12022,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $31, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $64, %esp
@@ -1647,15 +12033,3662 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: popl %ebx
; X86-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i256 %qwordOff, 6
%res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_64bytes:
+; FALLBACK0-LABEL: lshr_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rax
+; FALLBACK0-NEXT: movq 8(%rdi), %rcx
+; FALLBACK0-NEXT: movq 16(%rdi), %r8
+; FALLBACK0-NEXT: movq 24(%rdi), %r9
+; FALLBACK0-NEXT: movq 32(%rdi), %r10
+; FALLBACK0-NEXT: movq 40(%rdi), %r11
+; FALLBACK0-NEXT: movq 48(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %r14
+; FALLBACK0-NEXT: movl (%rsi), %edi
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rdi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %edi
+; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT: movq %r8, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %r8, %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r10, %r8
+; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq %r10, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: leaq (%r14,%r14), %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: orq %r15, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r10, %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: movq %rbx, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r12
+; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: addq %rbx, %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: orq %r14, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: orq %r13, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT: movq %r14, 48(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: lshr_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r15
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: movq (%rdi), %rcx
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %r10
+; FALLBACK1-NEXT: movq 32(%rdi), %r11
+; FALLBACK1-NEXT: movq 40(%rdi), %rbx
+; FALLBACK1-NEXT: movq 48(%rdi), %r14
+; FALLBACK1-NEXT: movq 56(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %eax
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rax,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %eax
+; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq %r9, %r8
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq %r11, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK1-NEXT: movq %r14, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %r11, 48(%rdx)
+; FALLBACK1-NEXT: movq %rax, 56(%rdx)
+; FALLBACK1-NEXT: movq %r10, 32(%rdx)
+; FALLBACK1-NEXT: movq %r15, 40(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r8, 8(%rdx)
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: popq %r15
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: lshr_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %r10
+; FALLBACK2-NEXT: movq 32(%rdi), %r11
+; FALLBACK2-NEXT: movq 40(%rdi), %rbx
+; FALLBACK2-NEXT: movq 48(%rdi), %r14
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rax,8), %ecx
+; FALLBACK2-NEXT: andl $56, %ecx
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9
+; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13
+; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8
+; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT: movl %ecx, %r12d
+; FALLBACK2-NEXT: notb %r12b
+; FALLBACK2-NEXT: addq %r9, %r9
+; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT: orq %rbx, %r9
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r13, %rdi
+; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT: addq %r10, %r10
+; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT: orq %r8, %r10
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT: orq %r15, %r8
+; FALLBACK2-NEXT: addq %r14, %r14
+; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT: orq %rbp, %r11
+; FALLBACK2-NEXT: addq %rax, %rax
+; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT: orq %r13, %rax
+; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT: movq %rax, 48(%rdx)
+; FALLBACK2-NEXT: movq %r11, 32(%rdx)
+; FALLBACK2-NEXT: movq %r8, 40(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT: movq %r10, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, (%rdx)
+; FALLBACK2-NEXT: movq %r9, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r15
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: movq (%rdi), %rcx
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %r10
+; FALLBACK3-NEXT: movq 32(%rdi), %r11
+; FALLBACK3-NEXT: movq 40(%rdi), %rbx
+; FALLBACK3-NEXT: movq 48(%rdi), %r14
+; FALLBACK3-NEXT: movq 56(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %eax
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rax,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %eax
+; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq %r9, %r8
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq %r11, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK3-NEXT: movq %r14, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT: movq %r11, 48(%rdx)
+; FALLBACK3-NEXT: movq %r10, 32(%rdx)
+; FALLBACK3-NEXT: movq %r15, 40(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r8, 8(%rdx)
+; FALLBACK3-NEXT: movq %rax, 56(%rdx)
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: popq %r15
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbp
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: pushq %rax
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT: movl (%rsi), %r8d
+; FALLBACK4-NEXT: xorps %xmm4, %xmm4
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%r8,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %r8d
+; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12
+; FALLBACK4-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r14
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r14, %r10
+; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14
+; FALLBACK4-NEXT: movq %r14, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r13
+; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp
+; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: orq %r13, %r15
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: addq %r14, %r14
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: orq %r12, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbp
+; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8
+; FALLBACK4-NEXT: leaq (%r8,%r8), %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: orq %rbp, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: addq %rbx, %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r9, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: movq %r8, 56(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT: movq %r12, 48(%rdx)
+; FALLBACK4-NEXT: movq %r14, 32(%rdx)
+; FALLBACK4-NEXT: movq %r15, 40(%rdx)
+; FALLBACK4-NEXT: movq %r10, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: addq $8, %rsp
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: popq %rbp
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: xorps %xmm4, %xmm4
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT: movq %r10, %r8
+; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT: movq %r11, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r15
+; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shrq %cl, %r11
+; FALLBACK5-NEXT: movq %r15, 8(%rdx)
+; FALLBACK5-NEXT: movq %r9, 48(%rdx)
+; FALLBACK5-NEXT: movq %r11, 56(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r14, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: pushq %rax
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: xorps %xmm4, %xmm4
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %esi
+; FALLBACK6-NEXT: andl $56, %esi
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT: movl %esi, %ebx
+; FALLBACK6-NEXT: notb %bl
+; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT: orq %r11, %r8
+; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT: orq %r12, %r11
+; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r9, %rdi
+; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT: orq %r14, %r9
+; FALLBACK6-NEXT: addq %r10, %r10
+; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT: orq %r15, %r10
+; FALLBACK6-NEXT: addq %rax, %rax
+; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT: orq %r13, %rax
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT: orq %rbp, %rcx
+; FALLBACK6-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %r10, 32(%rdx)
+; FALLBACK6-NEXT: movq %r9, 40(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r8, (%rdx)
+; FALLBACK6-NEXT: addq $8, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: xorps %xmm4, %xmm4
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT: movq %r10, %r8
+; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT: movq %r11, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r15
+; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT: movq %r15, 8(%rdx)
+; FALLBACK7-NEXT: movq %r9, 48(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r14, (%rdx)
+; FALLBACK7-NEXT: movq %r10, 56(%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: lshr_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbp
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: pushq %rax
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT: movl (%rsi), %r9d
+; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%r9,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %r9d
+; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12
+; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r14
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r14, %r10
+; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14
+; FALLBACK8-NEXT: movq %r14, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r13
+; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp
+; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: orq %r13, %r15
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: addq %r14, %r14
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: orq %r12, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbp
+; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: orq %rbp, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %rbx, %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r8, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 56(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT: movq %r12, 48(%rdx)
+; FALLBACK8-NEXT: movq %r14, 32(%rdx)
+; FALLBACK8-NEXT: movq %r15, 40(%rdx)
+; FALLBACK8-NEXT: movq %r10, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: addq $8, %rsp
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: popq %rbp
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT: movq %r10, %r8
+; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT: movq %r11, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r15
+; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shrq %cl, %r11
+; FALLBACK9-NEXT: movq %r15, 8(%rdx)
+; FALLBACK9-NEXT: movq %r9, 48(%rdx)
+; FALLBACK9-NEXT: movq %r11, 56(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r14, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: pushq %rax
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %esi
+; FALLBACK10-NEXT: andl $56, %esi
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT: movl %esi, %ebx
+; FALLBACK10-NEXT: notb %bl
+; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT: orq %r11, %r8
+; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT: orq %r12, %r11
+; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r9, %rdi
+; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT: orq %r14, %r9
+; FALLBACK10-NEXT: addq %r10, %r10
+; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT: orq %r15, %r10
+; FALLBACK10-NEXT: addq %rax, %rax
+; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT: orq %r13, %rax
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT: orq %rbp, %rcx
+; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %r10, 32(%rdx)
+; FALLBACK10-NEXT: movq %r9, 40(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r8, (%rdx)
+; FALLBACK10-NEXT: addq $8, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT: movq %r10, %r8
+; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT: movq %r11, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r15
+; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT: movq %r15, 8(%rdx)
+; FALLBACK11-NEXT: movq %r9, 48(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r14, (%rdx)
+; FALLBACK11-NEXT: movq %r10, 56(%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbp
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: pushq %rax
+; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT: movl (%rsi), %r9d
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%r9,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %r9d
+; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12
+; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r14
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r14, %r10
+; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14
+; FALLBACK12-NEXT: movq %r14, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r13
+; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp
+; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: orq %r13, %r15
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: addq %r14, %r14
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: orq %r12, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbp
+; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: orq %rbp, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %rbx, %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r8, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 56(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT: movq %r12, 48(%rdx)
+; FALLBACK12-NEXT: movq %r14, 32(%rdx)
+; FALLBACK12-NEXT: movq %r15, 40(%rdx)
+; FALLBACK12-NEXT: movq %r10, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: addq $8, %rsp
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: popq %rbp
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT: movl (%rsi), %edi
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rdi,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %edi
+; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi
+; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9
+; FALLBACK13-NEXT: movq %r9, %rax
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax
+; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10
+; FALLBACK13-NEXT: movq %r10, %r8
+; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9
+; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11
+; FALLBACK13-NEXT: movq %r11, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi
+; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11
+; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14
+; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r15
+; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shrq %cl, %r11
+; FALLBACK13-NEXT: movq %r15, 8(%rdx)
+; FALLBACK13-NEXT: movq %r9, 48(%rdx)
+; FALLBACK13-NEXT: movq %r11, 56(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rax, 24(%rdx)
+; FALLBACK13-NEXT: movq %r14, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: pushq %rax
+; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT: movl (%rsi), %esi
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK14-NEXT: andl $56, %ecx
+; FALLBACK14-NEXT: andl $56, %esi
+; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11
+; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax
+; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12
+; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9
+; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10
+; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14
+; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15
+; FALLBACK14-NEXT: movl %ecx, %ebx
+; FALLBACK14-NEXT: notb %bl
+; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp
+; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT: orq %r11, %r8
+; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT: orq %r12, %r11
+; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12
+; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r9, %rdi
+; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT: orq %r14, %r9
+; FALLBACK14-NEXT: addq %r10, %r10
+; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT: orq %r15, %r10
+; FALLBACK14-NEXT: addq %rsi, %rsi
+; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT: orq %r13, %rsi
+; FALLBACK14-NEXT: addq %rax, %rax
+; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT: orq %rbp, %rax
+; FALLBACK14-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 48(%rdx)
+; FALLBACK14-NEXT: movq %r10, 32(%rdx)
+; FALLBACK14-NEXT: movq %r9, 40(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r8, (%rdx)
+; FALLBACK14-NEXT: addq $8, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT: movq %r10, %r8
+; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT: movq %r11, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r15
+; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT: movq %r15, 8(%rdx)
+; FALLBACK15-NEXT: movq %r9, 48(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r14, (%rdx)
+; FALLBACK15-NEXT: movq %r10, 56(%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: lshr_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%eax), %ebp
+; FALLBACK16-NEXT: movl 44(%eax), %ebx
+; FALLBACK16-NEXT: movl 48(%eax), %edi
+; FALLBACK16-NEXT: movl 52(%eax), %esi
+; FALLBACK16-NEXT: movl 56(%eax), %edx
+; FALLBACK16-NEXT: movl 60(%eax), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %eax
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %eax, %esi
+; FALLBACK16-NEXT: andl $60, %esi
+; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT: shll $3, %eax
+; FALLBACK16-NEXT: andl $24, %eax
+; FALLBACK16-NEXT: movl %edx, %edi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT: movb %al, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %edi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK16-NEXT: movl %edx, %ebp
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %ebp, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: addl %eax, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %eax, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK16-NEXT: leal (%edx,%edx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %ebp, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %edx, %edi
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK16-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 60(%eax)
+; FALLBACK16-NEXT: movl %edx, 56(%eax)
+; FALLBACK16-NEXT: movl %esi, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl %edi, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%ecx), %ebp
+; FALLBACK17-NEXT: movl 44(%ecx), %ebx
+; FALLBACK17-NEXT: movl 48(%ecx), %edi
+; FALLBACK17-NEXT: movl 52(%ecx), %esi
+; FALLBACK17-NEXT: movl 56(%ecx), %edx
+; FALLBACK17-NEXT: movl 60(%ecx), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %edx
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: shrl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 60(%ebp)
+; FALLBACK17-NEXT: movl %esi, 48(%ebp)
+; FALLBACK17-NEXT: movl %edi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebp
+; FALLBACK18-NEXT: movl 44(%eax), %ebx
+; FALLBACK18-NEXT: movl 48(%eax), %edi
+; FALLBACK18-NEXT: movl 52(%eax), %esi
+; FALLBACK18-NEXT: movl 56(%eax), %edx
+; FALLBACK18-NEXT: movl 60(%eax), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %eax
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
+; FALLBACK18-NEXT: leal (,%eax,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ecx
+; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl %ecx, %edi
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %eax, %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %edx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebx, 56(%eax)
+; FALLBACK18-NEXT: movl %edi, 48(%eax)
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl %esi, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $188, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%ecx), %ebp
+; FALLBACK19-NEXT: movl 44(%ecx), %ebx
+; FALLBACK19-NEXT: movl 48(%ecx), %edi
+; FALLBACK19-NEXT: movl 52(%ecx), %esi
+; FALLBACK19-NEXT: movl 56(%ecx), %edx
+; FALLBACK19-NEXT: movl 60(%ecx), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %ecx
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, %ebp
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shll $3, %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl %edi, %edx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 56(%ebp)
+; FALLBACK19-NEXT: movl %esi, 48(%ebp)
+; FALLBACK19-NEXT: movl %edx, 52(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 44(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 32(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 36(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 16(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT: movl %eax, 60(%ebp)
+; FALLBACK19-NEXT: addl $188, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: xorps %xmm4, %xmm4
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %esi
+; FALLBACK20-NEXT: andl $60, %esi
+; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %edi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %eax, %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT: leal (%edx,%edx), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: addl %edi, %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %eax, %edx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 60(%eax)
+; FALLBACK20-NEXT: movl %edx, 56(%eax)
+; FALLBACK20-NEXT: movl %esi, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl %edi, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT: movl (%eax), %ecx
+; FALLBACK21-NEXT: xorps %xmm4, %xmm4
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %esi
+; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %edx
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: shrl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 60(%ebp)
+; FALLBACK21-NEXT: movl %esi, 48(%ebp)
+; FALLBACK21-NEXT: movl %edi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT: movl (%eax), %ecx
+; FALLBACK22-NEXT: xorps %xmm4, %xmm4
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: leal (,%ecx,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %edi, %ebp
+; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edi, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax
+; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %eax, %eax
+; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx
+; FALLBACK22-NEXT: addl %ebp, %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT: orl %eax, %ebx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl %edx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebx, 56(%eax)
+; FALLBACK22-NEXT: movl %edi, 48(%eax)
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl %esi, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, (%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $188, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT: movl (%eax), %ecx
+; FALLBACK23-NEXT: xorps %xmm4, %xmm4
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ecx, %ebp
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shll $3, %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %esi
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl %edi, %edx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %eax, 56(%ebp)
+; FALLBACK23-NEXT: movl %esi, 48(%ebp)
+; FALLBACK23-NEXT: movl %edx, 52(%ebp)
+; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 44(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 32(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 36(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 24(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 28(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 16(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 20(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 8(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 12(%ebp)
+; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, (%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT: movl %eax, 60(%ebp)
+; FALLBACK23-NEXT: addl $188, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT: movl (%eax), %ecx
+; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, %esi
+; FALLBACK24-NEXT: andl $60, %esi
+; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT: shll $3, %ecx
+; FALLBACK24-NEXT: andl $24, %ecx
+; FALLBACK24-NEXT: movl %edx, %edi
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK24-NEXT: movl %ecx, %ebp
+; FALLBACK24-NEXT: movb %cl, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %edi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %eax, %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT: leal (%edx,%edx), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: addl %edi, %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %eax, %edx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 60(%eax)
+; FALLBACK24-NEXT: movl %edx, 56(%eax)
+; FALLBACK24-NEXT: movl %esi, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl %edi, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT: movl (%eax), %ecx
+; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %esi
+; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %edx
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: shrl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 60(%ebp)
+; FALLBACK25-NEXT: movl %esi, 48(%ebp)
+; FALLBACK25-NEXT: movl %edi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT: movl (%eax), %ecx
+; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: leal (,%ecx,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %edi, %ebp
+; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi
+; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax
+; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %eax, %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx
+; FALLBACK26-NEXT: orl %eax, %ebx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: movl %edx, 60(%ecx)
+; FALLBACK26-NEXT: movl %ebx, 56(%ecx)
+; FALLBACK26-NEXT: movl %edi, 48(%ecx)
+; FALLBACK26-NEXT: movl %esi, 52(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 40(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 44(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 32(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 36(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 24(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 28(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 16(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 4(%ecx)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $188, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT: movl (%eax), %ecx
+; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ecx, %ebp
+; FALLBACK27-NEXT: andl $60, %ebp
+; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shll $3, %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %esi
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl %edi, %edx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %eax, 56(%ebp)
+; FALLBACK27-NEXT: movl %esi, 48(%ebp)
+; FALLBACK27-NEXT: movl %edx, 52(%ebp)
+; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 44(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 32(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 36(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 24(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 28(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 16(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 20(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 8(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 12(%ebp)
+; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, (%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT: movl %eax, 60(%ebp)
+; FALLBACK27-NEXT: addl $188, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT: movl (%eax), %ecx
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, %esi
+; FALLBACK28-NEXT: andl $60, %esi
+; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT: shll $3, %ecx
+; FALLBACK28-NEXT: andl $24, %ecx
+; FALLBACK28-NEXT: movl %edx, %edi
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK28-NEXT: movl %ecx, %ebp
+; FALLBACK28-NEXT: movb %cl, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %edi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %eax, %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT: leal (%edx,%edx), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: addl %edi, %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %eax, %edx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 60(%eax)
+; FALLBACK28-NEXT: movl %edx, 56(%eax)
+; FALLBACK28-NEXT: movl %esi, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl %edi, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT: movl (%eax), %ecx
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %esi
+; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %edx
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: shrl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 60(%ebp)
+; FALLBACK29-NEXT: movl %esi, 48(%ebp)
+; FALLBACK29-NEXT: movl %edi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT: movl (%eax), %edx
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: leal (,%edx,8), %ecx
+; FALLBACK30-NEXT: andl $24, %ecx
+; FALLBACK30-NEXT: andl $60, %edx
+; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi
+; FALLBACK30-NEXT: movl %ecx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %edi, %ebp
+; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi
+; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax
+; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %eax, %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax
+; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx
+; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp
+; FALLBACK30-NEXT: leal (%edx,%edx), %ecx
+; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx
+; FALLBACK30-NEXT: orl %eax, %edx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: movl %ebp, 60(%ecx)
+; FALLBACK30-NEXT: movl %edx, 56(%ecx)
+; FALLBACK30-NEXT: movl %edi, 48(%ecx)
+; FALLBACK30-NEXT: movl %esi, 52(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 40(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 44(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 32(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 36(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 24(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 28(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 16(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 4(%ecx)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $188, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT: movl (%eax), %ecx
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ecx, %ebp
+; FALLBACK31-NEXT: andl $60, %ebp
+; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shll $3, %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %esi
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl %edi, %edx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %eax, 56(%ebp)
+; FALLBACK31-NEXT: movl %esi, 48(%ebp)
+; FALLBACK31-NEXT: movl %edx, 52(%ebp)
+; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 44(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 32(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 36(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 24(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 28(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 16(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 20(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 8(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 12(%ebp)
+; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, (%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT: movl %eax, 60(%ebp)
+; FALLBACK31-NEXT: addl $188, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i512, ptr %src.ptr, align 1
+ %byteOff = load i512, ptr %byteOff.ptr, align 1
+ %bitOff = shl i512 %byteOff, 3
+ %res = lshr i512 %src, %bitOff
+ store i512 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: lshr_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rbx
; X64-SSE2-NEXT: movq (%rdi), %rax
@@ -1667,6 +15700,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq 48(%rdi), %rbx
; X64-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-SSE2-NEXT: movl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -1675,23 +15713,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT: andl $7, %esi
+; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
; X64-SSE2-NEXT: movq %r11, 56(%rdx)
; X64-SSE2-NEXT: movq %r10, 32(%rdx)
@@ -1703,35 +15733,38 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: popq %rbx
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: lshr_64bytes:
+; X64-SSE42-LABEL: lshr_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rax
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
; X64-SSE42-NEXT: movl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %eax
-; X64-SSE42-NEXT: movups -128(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rax), %xmm3
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: andl $7, %eax
+; X64-SSE42-NEXT: movups -128(%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT: movups -112(%rsp,%rax,8), %xmm1
+; X64-SSE42-NEXT: movups -96(%rsp,%rax,8), %xmm2
+; X64-SSE42-NEXT: movups -80(%rsp,%rax,8), %xmm3
; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX1-LABEL: lshr_64bytes:
+; X64-AVX1-LABEL: lshr_64bytes_qwordOff:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT: movl (%rsi), %eax
@@ -1740,44 +15773,47 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: andl $63, %eax
-; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX1-NEXT: andl $7, %eax
+; X64-AVX1-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
+; X64-AVX1-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
+; X64-AVX1-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
+; X64-AVX1-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT: popq %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: lshr_64bytes:
+; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: pushq %rax
; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-AVX512-NEXT: movl (%rsi), %eax
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $63, %eax
-; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX512-NEXT: andl $7, %eax
+; X64-AVX512-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
+; X64-AVX512-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
+; X64-AVX512-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
+; X64-AVX512-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT: popq %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
;
-; X86-SSE2-LABEL: lshr_64bytes:
+; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
+; X86-SSE2-NEXT: subl $188, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1798,7 +15834,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 32(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 40(%eax), %ebp
; X86-SSE2-NEXT: movl 44(%eax), %ebx
; X86-SSE2-NEXT: movl 48(%eax), %edi
@@ -1807,13 +15843,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 60(%eax), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %eax
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -1821,6 +15861,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1833,49 +15874,33 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 56(%eax)
; X86-SSE2-NEXT: movl %edx, 60(%eax)
@@ -1883,7 +15908,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, 52(%eax)
; X86-SSE2-NEXT: movl %ebx, 40(%eax)
; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 32(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 36(%eax)
@@ -1903,16 +15928,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
+; X86-SSE2-NEXT: addl $188, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: lshr_64bytes:
+; X86-SSE42-LABEL: lshr_64bytes_qwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $128, %esp
+; X86-SSE42-NEXT: subl $140, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1922,29 +15947,29 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 48(%edx), %xmm3
; X86-SSE42-NEXT: movl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm4, %xmm4
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2
+; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3
; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $128, %esp
+; X86-SSE42-NEXT: addl $140, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX1-LABEL: lshr_64bytes:
+; X86-AVX1-LABEL: lshr_64bytes_qwordOff:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: subl $128, %esp
+; X86-AVX1-NEXT: subl $140, %esp
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1956,22 +15981,22 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX1-NEXT: andl $63, %ecx
-; X86-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX1-NEXT: andl $7, %ecx
+; X86-AVX1-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX1-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX1-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX1-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT: addl $128, %esp
+; X86-AVX1-NEXT: addl $140, %esp
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: lshr_64bytes:
+; X86-AVX512-LABEL: lshr_64bytes_qwordOff:
; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $128, %esp
+; X86-AVX512-NEXT: subl $140, %esp
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1980,27 +16005,3801 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
; X86-AVX512-NEXT: vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT: andl $63, %ecx
-; X86-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX512-NEXT: andl $7, %ecx
+; X86-AVX512-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX512-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX512-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX512-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $128, %esp
+; X86-AVX512-NEXT: addl $140, %esp
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
- %byteOff = load i512, ptr %byteOff.ptr, align 1
- %bitOff = shl i512 %byteOff, 3
+ %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i512 %qwordOff, 6
%res = lshr i512 %src, %bitOff
store i512 %res, ptr %dst, align 1
ret void
}
+
define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_64bytes:
+; FALLBACK0-LABEL: shl_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rax
+; FALLBACK0-NEXT: movq 8(%rdi), %rcx
+; FALLBACK0-NEXT: movq 16(%rdi), %r8
+; FALLBACK0-NEXT: movq 24(%rdi), %r9
+; FALLBACK0-NEXT: movq 32(%rdi), %r10
+; FALLBACK0-NEXT: movq 40(%rdi), %r11
+; FALLBACK0-NEXT: movq 48(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %rdi
+; FALLBACK0-NEXT: movl (%rsi), %esi
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %esi
+; FALLBACK0-NEXT: negl %esi
+; FALLBACK0-NEXT: movslq %esi, %rbx
+; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8
+; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq %r8, %r9
+; FALLBACK0-NEXT: shrq %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: orq %r10, %r9
+; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r10
+; FALLBACK0-NEXT: movq %r10, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15
+; FALLBACK0-NEXT: movq %r15, %r11
+; FALLBACK0-NEXT: shrq %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: orq %r14, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %r15, %rdi
+; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14
+; FALLBACK0-NEXT: movq %r14, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r12
+; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13
+; FALLBACK0-NEXT: movq %r13, %r15
+; FALLBACK0-NEXT: shrq %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r13
+; FALLBACK0-NEXT: shrq %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: orq %r13, %r10
+; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r12
+; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx
+; FALLBACK0-NEXT: movq %rbx, %r13
+; FALLBACK0-NEXT: shrq %r13
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: orq %r12, %r13
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: shrq %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: orq %rbx, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r14, 48(%rdx)
+; FALLBACK0-NEXT: movq %r13, 56(%rdx)
+; FALLBACK0-NEXT: movq %r10, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: pushq %rax
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %rcx
+; FALLBACK1-NEXT: movq 16(%rdi), %r8
+; FALLBACK1-NEXT: movq 24(%rdi), %r9
+; FALLBACK1-NEXT: movq 32(%rdi), %r10
+; FALLBACK1-NEXT: movq 40(%rdi), %r11
+; FALLBACK1-NEXT: movq 48(%rdi), %rbx
+; FALLBACK1-NEXT: movq 56(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %esi
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %esi
+; FALLBACK1-NEXT: negl %esi
+; FALLBACK1-NEXT: movslq %esi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax
+; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10
+; FALLBACK1-NEXT: movq %r10, %rsi
+; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8
+; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK1-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx
+; FALLBACK1-NEXT: movq %rbx, %r14
+; FALLBACK1-NEXT: shldq %cl, %r11, %r14
+; FALLBACK1-NEXT: shldq %cl, %r10, %r11
+; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r10
+; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK1-NEXT: shldq %cl, %r10, %r9
+; FALLBACK1-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK1-NEXT: shldq %cl, %r8, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shlq %cl, %r8
+; FALLBACK1-NEXT: movq %r10, 48(%rdx)
+; FALLBACK1-NEXT: movq %r9, 56(%rdx)
+; FALLBACK1-NEXT: movq %r11, 32(%rdx)
+; FALLBACK1-NEXT: movq %r14, 40(%rdx)
+; FALLBACK1-NEXT: movq %rax, 16(%rdx)
+; FALLBACK1-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK1-NEXT: movq %r8, (%rdx)
+; FALLBACK1-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK1-NEXT: addq $8, %rsp
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq (%rdi), %rax
+; FALLBACK2-NEXT: movq 8(%rdi), %rcx
+; FALLBACK2-NEXT: movq 16(%rdi), %r8
+; FALLBACK2-NEXT: movq 24(%rdi), %r9
+; FALLBACK2-NEXT: movq 32(%rdi), %r10
+; FALLBACK2-NEXT: movq 40(%rdi), %r11
+; FALLBACK2-NEXT: movq 48(%rdi), %rbx
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %esi
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: andl $56, %esi
+; FALLBACK2-NEXT: negl %esi
+; FALLBACK2-NEXT: movslq %esi, %rsi
+; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10
+; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9
+; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14
+; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx
+; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8
+; FALLBACK2-NEXT: shlxq %rax, %r8, %r15
+; FALLBACK2-NEXT: shlxq %rax, %r10, %r12
+; FALLBACK2-NEXT: movl %eax, %r13d
+; FALLBACK2-NEXT: notb %r13b
+; FALLBACK2-NEXT: shrq %r10
+; FALLBACK2-NEXT: shrxq %r13, %r10, %r10
+; FALLBACK2-NEXT: orq %r9, %r10
+; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9
+; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp
+; FALLBACK2-NEXT: shrq %r14
+; FALLBACK2-NEXT: shrxq %r13, %r14, %r14
+; FALLBACK2-NEXT: orq %r11, %r14
+; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
+; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx
+; FALLBACK2-NEXT: orq %rbx, %rcx
+; FALLBACK2-NEXT: shrq %r9
+; FALLBACK2-NEXT: shrxq %r13, %r9, %r9
+; FALLBACK2-NEXT: orq %r15, %r9
+; FALLBACK2-NEXT: shrq %rdi
+; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi
+; FALLBACK2-NEXT: orq %rbp, %rdi
+; FALLBACK2-NEXT: shrq %rsi
+; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: shrq %r8
+; FALLBACK2-NEXT: shrxq %r13, %r8, %r8
+; FALLBACK2-NEXT: orq %rax, %r8
+; FALLBACK2-NEXT: movq %r12, (%rdx)
+; FALLBACK2-NEXT: movq %r8, 48(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK2-NEXT: movq %r9, 40(%rdx)
+; FALLBACK2-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK2-NEXT: movq %r14, 24(%rdx)
+; FALLBACK2-NEXT: movq %r10, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: pushq %rax
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %rcx
+; FALLBACK3-NEXT: movq 16(%rdi), %r8
+; FALLBACK3-NEXT: movq 24(%rdi), %r9
+; FALLBACK3-NEXT: movq 32(%rdi), %r10
+; FALLBACK3-NEXT: movq 40(%rdi), %r11
+; FALLBACK3-NEXT: movq 48(%rdi), %rbx
+; FALLBACK3-NEXT: movq 56(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %esi
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %esi
+; FALLBACK3-NEXT: negl %esi
+; FALLBACK3-NEXT: movslq %esi, %r8
+; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax
+; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK3-NEXT: movq %r9, %rsi
+; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10
+; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi
+; FALLBACK3-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11
+; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx
+; FALLBACK3-NEXT: movq %rbx, %r14
+; FALLBACK3-NEXT: shldq %cl, %r11, %r14
+; FALLBACK3-NEXT: shldq %cl, %r9, %r11
+; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK3-NEXT: shldq %cl, %r9, %r8
+; FALLBACK3-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK3-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK3-NEXT: shlxq %rcx, %r10, %rcx
+; FALLBACK3-NEXT: movq %r9, 48(%rdx)
+; FALLBACK3-NEXT: movq %r8, 56(%rdx)
+; FALLBACK3-NEXT: movq %r11, 32(%rdx)
+; FALLBACK3-NEXT: movq %r14, 40(%rdx)
+; FALLBACK3-NEXT: movq %rax, 16(%rdx)
+; FALLBACK3-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK3-NEXT: addq $8, %rsp
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT: movl (%rsi), %ecx
+; FALLBACK4-NEXT: xorps %xmm4, %xmm4
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %ecx
+; FALLBACK4-NEXT: negl %ecx
+; FALLBACK4-NEXT: movslq %ecx, %r9
+; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi
+; FALLBACK4-NEXT: movq %rdi, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK4-NEXT: movq %r11, %r8
+; FALLBACK4-NEXT: shrq %r8
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: orq %r10, %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r10
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r11, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK4-NEXT: movq %r15, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK4-NEXT: movq %r12, %rbx
+; FALLBACK4-NEXT: shrq %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: orq %r15, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: movq %r14, %r15
+; FALLBACK4-NEXT: shrq %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r15
+; FALLBACK4-NEXT: orq %r12, %r15
+; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK4-NEXT: movq %r12, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r13
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r13, %rdi
+; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: shrq %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: orq %r9, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: movq %r14, (%rdx)
+; FALLBACK4-NEXT: movq %r12, 56(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 48(%rdx)
+; FALLBACK4-NEXT: movq %r15, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 32(%rdx)
+; FALLBACK4-NEXT: movq %r8, 40(%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: xorps %xmm4, %xmm4
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: negl %eax
+; FALLBACK5-NEXT: movslq %eax, %r8
+; FALLBACK5-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK5-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK5-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK5-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK5-NEXT: movq %r14, %r15
+; FALLBACK5-NEXT: shldq %cl, %r9, %r15
+; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK5-NEXT: shldq %cl, %r14, %r8
+; FALLBACK5-NEXT: movq %r11, %r9
+; FALLBACK5-NEXT: shlq %cl, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK5-NEXT: movq %r8, 56(%rdx)
+; FALLBACK5-NEXT: movq %r15, 48(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK5-NEXT: movq %r10, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %rax, 32(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: subq $24, %rsp
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: xorps %xmm4, %xmm4
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm3, (%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %ecx
+; FALLBACK6-NEXT: andl $56, %ecx
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: negl %eax
+; FALLBACK6-NEXT: movslq %eax, %rsi
+; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax
+; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12
+; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15
+; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13
+; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8
+; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11
+; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14
+; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx
+; FALLBACK6-NEXT: movl %ecx, %r9d
+; FALLBACK6-NEXT: notb %r9b
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r12, %rdi
+; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp
+; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8
+; FALLBACK6-NEXT: shrq %r13
+; FALLBACK6-NEXT: shrxq %r9, %r13, %r12
+; FALLBACK6-NEXT: orq %r15, %r12
+; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx
+; FALLBACK6-NEXT: shrq %r11
+; FALLBACK6-NEXT: shrxq %r9, %r11, %r11
+; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK6-NEXT: shrq %r14
+; FALLBACK6-NEXT: shrxq %r9, %r14, %r14
+; FALLBACK6-NEXT: orq %r10, %r14
+; FALLBACK6-NEXT: shrq %rsi
+; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi
+; FALLBACK6-NEXT: orq %rbx, %rsi
+; FALLBACK6-NEXT: shrq %rax
+; FALLBACK6-NEXT: shrxq %r9, %rax, %rax
+; FALLBACK6-NEXT: orq %r8, %rax
+; FALLBACK6-NEXT: shrq %rbp
+; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8
+; FALLBACK6-NEXT: orq %r15, %r8
+; FALLBACK6-NEXT: movq %rcx, (%rdx)
+; FALLBACK6-NEXT: movq %r8, 56(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK6-NEXT: movq %r14, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r12, 32(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK6-NEXT: addq $24, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: xorps %xmm4, %xmm4
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: negl %eax
+; FALLBACK7-NEXT: movslq %eax, %r8
+; FALLBACK7-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK7-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK7-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK7-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK7-NEXT: movq %r14, %r15
+; FALLBACK7-NEXT: shldq %cl, %r9, %r15
+; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK7-NEXT: shldq %cl, %r14, %r8
+; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK7-NEXT: movq %r8, 56(%rdx)
+; FALLBACK7-NEXT: movq %r15, 48(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK7-NEXT: movq %r10, 16(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT: movq %rax, 32(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT: movl (%rsi), %ecx
+; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %ecx
+; FALLBACK8-NEXT: negl %ecx
+; FALLBACK8-NEXT: movslq %ecx, %r9
+; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi
+; FALLBACK8-NEXT: movq %rdi, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK8-NEXT: movq %r11, %r8
+; FALLBACK8-NEXT: shrq %r8
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: orq %r10, %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r10
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r11, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK8-NEXT: movq %r15, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK8-NEXT: movq %r12, %rbx
+; FALLBACK8-NEXT: shrq %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: orq %r15, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: movq %r14, %r15
+; FALLBACK8-NEXT: shrq %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r15
+; FALLBACK8-NEXT: orq %r12, %r15
+; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK8-NEXT: movq %r12, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r13
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r13, %rdi
+; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: shrq %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: orq %r9, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: movq %r14, (%rdx)
+; FALLBACK8-NEXT: movq %r12, 56(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 48(%rdx)
+; FALLBACK8-NEXT: movq %r15, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 32(%rdx)
+; FALLBACK8-NEXT: movq %r8, 40(%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: negl %eax
+; FALLBACK9-NEXT: movslq %eax, %r8
+; FALLBACK9-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK9-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK9-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK9-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK9-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK9-NEXT: movq %r14, %r15
+; FALLBACK9-NEXT: shldq %cl, %r9, %r15
+; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK9-NEXT: shldq %cl, %r14, %r8
+; FALLBACK9-NEXT: movq %r11, %r9
+; FALLBACK9-NEXT: shlq %cl, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK9-NEXT: movq %r8, 56(%rdx)
+; FALLBACK9-NEXT: movq %r15, 48(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK9-NEXT: movq %r10, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %rax, 32(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: subq $24, %rsp
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %ecx
+; FALLBACK10-NEXT: andl $56, %ecx
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: negl %eax
+; FALLBACK10-NEXT: movslq %eax, %rsi
+; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax
+; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12
+; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15
+; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13
+; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8
+; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11
+; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14
+; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx
+; FALLBACK10-NEXT: movl %ecx, %r9d
+; FALLBACK10-NEXT: notb %r9b
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r12, %rdi
+; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp
+; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8
+; FALLBACK10-NEXT: shrq %r13
+; FALLBACK10-NEXT: shrxq %r9, %r13, %r12
+; FALLBACK10-NEXT: orq %r15, %r12
+; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx
+; FALLBACK10-NEXT: shrq %r11
+; FALLBACK10-NEXT: shrxq %r9, %r11, %r11
+; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK10-NEXT: shrq %r14
+; FALLBACK10-NEXT: shrxq %r9, %r14, %r14
+; FALLBACK10-NEXT: orq %r10, %r14
+; FALLBACK10-NEXT: shrq %rsi
+; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi
+; FALLBACK10-NEXT: orq %rbx, %rsi
+; FALLBACK10-NEXT: shrq %rax
+; FALLBACK10-NEXT: shrxq %r9, %rax, %rax
+; FALLBACK10-NEXT: orq %r8, %rax
+; FALLBACK10-NEXT: shrq %rbp
+; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8
+; FALLBACK10-NEXT: orq %r15, %r8
+; FALLBACK10-NEXT: movq %rcx, (%rdx)
+; FALLBACK10-NEXT: movq %r8, 56(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK10-NEXT: movq %r14, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r12, 32(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK10-NEXT: addq $24, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: negl %eax
+; FALLBACK11-NEXT: movslq %eax, %r8
+; FALLBACK11-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK11-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK11-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK11-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK11-NEXT: movq %r14, %r15
+; FALLBACK11-NEXT: shldq %cl, %r9, %r15
+; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK11-NEXT: shldq %cl, %r14, %r8
+; FALLBACK11-NEXT: shlxq %rcx, %r11, %r9
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK11-NEXT: movq %r8, 56(%rdx)
+; FALLBACK11-NEXT: movq %r15, 48(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK11-NEXT: movq %r10, 16(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT: movq %rax, 32(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT: movl (%rsi), %ecx
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %ecx
+; FALLBACK12-NEXT: negl %ecx
+; FALLBACK12-NEXT: movslq %ecx, %r9
+; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi
+; FALLBACK12-NEXT: movq %rdi, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK12-NEXT: movq %r11, %r8
+; FALLBACK12-NEXT: shrq %r8
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: orq %r10, %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r10
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r11, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK12-NEXT: movq %r15, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK12-NEXT: movq %r12, %rbx
+; FALLBACK12-NEXT: shrq %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: orq %r15, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: movq %r14, %r15
+; FALLBACK12-NEXT: shrq %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r15
+; FALLBACK12-NEXT: orq %r12, %r15
+; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK12-NEXT: movq %r12, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r13
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r13, %rdi
+; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: shrq %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: orq %r9, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: movq %r14, (%rdx)
+; FALLBACK12-NEXT: movq %r12, 56(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 48(%rdx)
+; FALLBACK12-NEXT: movq %r15, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 32(%rdx)
+; FALLBACK12-NEXT: movq %r8, 40(%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT: movl (%rsi), %eax
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %eax
+; FALLBACK13-NEXT: negl %eax
+; FALLBACK13-NEXT: movslq %eax, %r8
+; FALLBACK13-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK13-NEXT: movq %r9, %rsi
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK13-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK13-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK13-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK13-NEXT: movq %r14, %r15
+; FALLBACK13-NEXT: shldq %cl, %r9, %r15
+; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK13-NEXT: shldq %cl, %r14, %r8
+; FALLBACK13-NEXT: movq %r11, %r9
+; FALLBACK13-NEXT: shlq %cl, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK13-NEXT: movq %r8, 56(%rdx)
+; FALLBACK13-NEXT: movq %r15, 48(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK13-NEXT: movq %r10, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %rax, 32(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: subq $24, %rsp
+; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT: movl (%rsi), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rax,8), %ecx
+; FALLBACK14-NEXT: andl $56, %ecx
+; FALLBACK14-NEXT: andl $56, %eax
+; FALLBACK14-NEXT: negl %eax
+; FALLBACK14-NEXT: movslq %eax, %rsi
+; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax
+; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12
+; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15
+; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13
+; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8
+; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11
+; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14
+; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx
+; FALLBACK14-NEXT: movl %ecx, %r9d
+; FALLBACK14-NEXT: notb %r9b
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r12, %rdi
+; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp
+; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8
+; FALLBACK14-NEXT: shrq %r13
+; FALLBACK14-NEXT: shrxq %r9, %r13, %r12
+; FALLBACK14-NEXT: orq %r15, %r12
+; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT: shrq %r11
+; FALLBACK14-NEXT: shrxq %r9, %r11, %r11
+; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK14-NEXT: shrq %r14
+; FALLBACK14-NEXT: shrxq %r9, %r14, %r14
+; FALLBACK14-NEXT: orq %r10, %r14
+; FALLBACK14-NEXT: shrq %rsi
+; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi
+; FALLBACK14-NEXT: orq %rbx, %rsi
+; FALLBACK14-NEXT: shrq %rax
+; FALLBACK14-NEXT: shrxq %r9, %rax, %rax
+; FALLBACK14-NEXT: orq %r8, %rax
+; FALLBACK14-NEXT: shrq %rbp
+; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8
+; FALLBACK14-NEXT: orq %r15, %r8
+; FALLBACK14-NEXT: movq %rcx, (%rdx)
+; FALLBACK14-NEXT: movq %r8, 56(%rdx)
+; FALLBACK14-NEXT: movq %rax, 48(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK14-NEXT: movq %r14, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r12, 32(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK14-NEXT: addq $24, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: negl %eax
+; FALLBACK15-NEXT: movslq %eax, %r8
+; FALLBACK15-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK15-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK15-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK15-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK15-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK15-NEXT: movq %r14, %r15
+; FALLBACK15-NEXT: shldq %cl, %r9, %r15
+; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK15-NEXT: shldq %cl, %r14, %r8
+; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK15-NEXT: movq %r8, 56(%rdx)
+; FALLBACK15-NEXT: movq %r15, 48(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK15-NEXT: movq %r10, 16(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT: movq %rax, 32(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: shl_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%eax), %ebp
+; FALLBACK16-NEXT: movl 44(%eax), %ebx
+; FALLBACK16-NEXT: movl 48(%eax), %edi
+; FALLBACK16-NEXT: movl 52(%eax), %esi
+; FALLBACK16-NEXT: movl 56(%eax), %edx
+; FALLBACK16-NEXT: movl 60(%eax), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %eax
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: andl $60, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: subl %edx, %ecx
+; FALLBACK16-NEXT: movl (%ecx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %edx
+; FALLBACK16-NEXT: movl %ecx, %ebp
+; FALLBACK16-NEXT: shll $3, %eax
+; FALLBACK16-NEXT: andl $24, %eax
+; FALLBACK16-NEXT: movl %edx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %al, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 8(%ebp), %esi
+; FALLBACK16-NEXT: movl %ebp, %edi
+; FALLBACK16-NEXT: movl %esi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: movl 20(%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 16(%edi), %esi
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movl 28(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 24(%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %esi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%edx), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 32(%edx), %esi
+; FALLBACK16-NEXT: movl %edx, %ebp
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 44(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 40(%ebp), %esi
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 52(%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: negl %edx
+; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%edi), %edx
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl 56(%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %edx, %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 56(%eax)
+; FALLBACK16-NEXT: movl %edi, 60(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%ecx), %ebp
+; FALLBACK17-NEXT: movl 44(%ecx), %ebx
+; FALLBACK17-NEXT: movl 48(%ecx), %edi
+; FALLBACK17-NEXT: movl 52(%ecx), %esi
+; FALLBACK17-NEXT: movl 56(%ecx), %edx
+; FALLBACK17-NEXT: movl 60(%ecx), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: subl %ebp, %eax
+; FALLBACK17-NEXT: movl 8(%eax), %esi
+; FALLBACK17-NEXT: movl 12(%eax), %edx
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: movl %edx, %edi
+; FALLBACK17-NEXT: shldl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%eax), %edi
+; FALLBACK17-NEXT: movl 20(%eax), %esi
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%eax), %edi
+; FALLBACK17-NEXT: movl 28(%eax), %edx
+; FALLBACK17-NEXT: movl %edx, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%eax), %edi
+; FALLBACK17-NEXT: movl 36(%eax), %esi
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%eax), %edx
+; FALLBACK17-NEXT: movl 44(%eax), %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 56(%eax), %edx
+; FALLBACK17-NEXT: movl 60(%eax), %edi
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl (%eax), %ebx
+; FALLBACK17-NEXT: movl 52(%eax), %esi
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: negl %ebp
+; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl %edi, 60(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT: shll %cl, %ebx
+; FALLBACK17-NEXT: shldl %cl, %eax, %esi
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %edi, %eax
+; FALLBACK17-NEXT: movl %eax, 48(%ebp)
+; FALLBACK17-NEXT: movl %esi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl %edx, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebx
+; FALLBACK18-NEXT: movl 44(%eax), %edi
+; FALLBACK18-NEXT: movl 48(%eax), %esi
+; FALLBACK18-NEXT: movl 52(%eax), %edx
+; FALLBACK18-NEXT: movl 56(%eax), %ecx
+; FALLBACK18-NEXT: movl 60(%eax), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK18-NEXT: movl (%ebp), %ebp
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: leal (,%ebp,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ebp
+; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK18-NEXT: subl %ebp, %edi
+; FALLBACK18-NEXT: movl (%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%edi), %esi
+; FALLBACK18-NEXT: movl %esi, %ecx
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 12(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: movl 20(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 28(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %eax, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: movl 36(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 44(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %eax, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%edi), %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 52(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %eax, %ebp
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: negl %eax
+; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK18-NEXT: movl 56(%edi), %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %edx, %esi
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, (%eax)
+; FALLBACK18-NEXT: movl %esi, 56(%eax)
+; FALLBACK18-NEXT: movl %ecx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebp, 48(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $204, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl (%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%ebp), %ebx
+; FALLBACK19-NEXT: movl 44(%ebp), %edi
+; FALLBACK19-NEXT: movl 48(%ebp), %esi
+; FALLBACK19-NEXT: movl 52(%ebp), %edx
+; FALLBACK19-NEXT: movl 56(%ebp), %ecx
+; FALLBACK19-NEXT: movl 60(%ebp), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl (%ebp), %ebp
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: leal (,%ebp,8), %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: subl %ebp, %eax
+; FALLBACK19-NEXT: movl 4(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%eax), %edi
+; FALLBACK19-NEXT: movl 12(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%eax), %edi
+; FALLBACK19-NEXT: movl 20(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%eax), %edi
+; FALLBACK19-NEXT: movl 28(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%eax), %edi
+; FALLBACK19-NEXT: movl 36(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%eax), %ebx
+; FALLBACK19-NEXT: movl 44(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT: movl 56(%eax), %edx
+; FALLBACK19-NEXT: movl 60(%eax), %edi
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl (%eax), %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 52(%eax), %esi
+; FALLBACK19-NEXT: shldl %cl, %esi, %edx
+; FALLBACK19-NEXT: negl %ebp
+; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl %edx, 56(%eax)
+; FALLBACK19-NEXT: movl %edi, 60(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK19-NEXT: movl %ebp, 48(%eax)
+; FALLBACK19-NEXT: movl %esi, 52(%eax)
+; FALLBACK19-NEXT: movl %ebx, 40(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 44(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 32(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 36(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 24(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 28(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 16(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 20(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 8(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 12(%eax)
+; FALLBACK19-NEXT: movl %edi, 4(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, (%eax)
+; FALLBACK19-NEXT: addl $204, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: xorps %xmm4, %xmm4
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: andl $60, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: subl %edx, %ecx
+; FALLBACK20-NEXT: movl (%ecx), %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 4(%ecx), %edx
+; FALLBACK20-NEXT: movl %ecx, %ebp
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %esi, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 12(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 8(%ebp), %esi
+; FALLBACK20-NEXT: movl %ebp, %edi
+; FALLBACK20-NEXT: movl %esi, %ebp
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl 20(%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 16(%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %esi, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %ebp, %edx
+; FALLBACK20-NEXT: movl 28(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 24(%ebp), %esi
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %esi, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 36(%edx), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 32(%edx), %esi
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 40(%ebp), %esi
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%ebp), %esi
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: negl %edx
+; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: movl 60(%edi), %edx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl 56(%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %edx, (%eax)
+; FALLBACK20-NEXT: movl %esi, 56(%eax)
+; FALLBACK20-NEXT: movl %edi, 60(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT: movl (%eax), %ecx
+; FALLBACK21-NEXT: xorps %xmm4, %xmm4
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: subl %ebp, %eax
+; FALLBACK21-NEXT: movl 8(%eax), %esi
+; FALLBACK21-NEXT: movl 12(%eax), %edx
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: movl %edx, %edi
+; FALLBACK21-NEXT: shldl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 4(%eax), %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 16(%eax), %edi
+; FALLBACK21-NEXT: movl 20(%eax), %esi
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 24(%eax), %edi
+; FALLBACK21-NEXT: movl 28(%eax), %edx
+; FALLBACK21-NEXT: movl %edx, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 32(%eax), %edi
+; FALLBACK21-NEXT: movl 36(%eax), %esi
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%eax), %edx
+; FALLBACK21-NEXT: movl 44(%eax), %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%eax), %edx
+; FALLBACK21-NEXT: movl 60(%eax), %edi
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl (%eax), %ebx
+; FALLBACK21-NEXT: movl 52(%eax), %esi
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: negl %ebp
+; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl %edi, 60(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK21-NEXT: shll %cl, %ebx
+; FALLBACK21-NEXT: shldl %cl, %eax, %esi
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %edi, %eax
+; FALLBACK21-NEXT: movl %eax, 48(%ebp)
+; FALLBACK21-NEXT: movl %esi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl %edx, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT: movl (%eax), %eax
+; FALLBACK22-NEXT: xorps %xmm4, %xmm4
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: leal (,%eax,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK22-NEXT: subl %eax, %edi
+; FALLBACK22-NEXT: movl (%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 4(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 8(%edi), %esi
+; FALLBACK22-NEXT: movl %esi, %ecx
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 12(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 16(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: movl 20(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 24(%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 28(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %eax, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 32(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: movl 36(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 40(%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 44(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %eax, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: movl 52(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %eax, %ebp
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: negl %eax
+; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK22-NEXT: movl 56(%edi), %eax
+; FALLBACK22-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edx, %esi
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK22-NEXT: movl %edx, (%eax)
+; FALLBACK22-NEXT: movl %esi, 56(%eax)
+; FALLBACK22-NEXT: movl %ecx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebp, 48(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $204, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT: movl (%eax), %ebp
+; FALLBACK23-NEXT: xorps %xmm4, %xmm4
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: leal (,%ebp,8), %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: subl %ebp, %eax
+; FALLBACK23-NEXT: movl 4(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 8(%eax), %edi
+; FALLBACK23-NEXT: movl 12(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 16(%eax), %edi
+; FALLBACK23-NEXT: movl 20(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 24(%eax), %edi
+; FALLBACK23-NEXT: movl 28(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 32(%eax), %edi
+; FALLBACK23-NEXT: movl 36(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%eax), %ebx
+; FALLBACK23-NEXT: movl 44(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT: movl 56(%eax), %edx
+; FALLBACK23-NEXT: movl 60(%eax), %edi
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl (%eax), %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 52(%eax), %esi
+; FALLBACK23-NEXT: shldl %cl, %esi, %edx
+; FALLBACK23-NEXT: negl %ebp
+; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %edx, 56(%eax)
+; FALLBACK23-NEXT: movl %edi, 60(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK23-NEXT: movl %ebp, 48(%eax)
+; FALLBACK23-NEXT: movl %esi, 52(%eax)
+; FALLBACK23-NEXT: movl %ebx, 40(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 44(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 32(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 36(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 24(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 28(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 16(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 12(%eax)
+; FALLBACK23-NEXT: movl %edi, 4(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, (%eax)
+; FALLBACK23-NEXT: addl $204, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT: movl (%eax), %eax
+; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: andl $60, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: subl %edx, %ecx
+; FALLBACK24-NEXT: movl (%ecx), %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 4(%ecx), %edx
+; FALLBACK24-NEXT: movl %ecx, %ebp
+; FALLBACK24-NEXT: shll $3, %eax
+; FALLBACK24-NEXT: andl $24, %eax
+; FALLBACK24-NEXT: movl %edx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %al, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %esi, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 12(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 8(%ebp), %esi
+; FALLBACK24-NEXT: movl %ebp, %edi
+; FALLBACK24-NEXT: movl %esi, %ebp
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl 20(%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 16(%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %esi, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %ebp, %edx
+; FALLBACK24-NEXT: movl 28(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 24(%ebp), %esi
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %esi, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 36(%edx), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 32(%edx), %esi
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 40(%ebp), %esi
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%ebp), %esi
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: negl %edx
+; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: movl 60(%edi), %edx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl 56(%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %edx, (%eax)
+; FALLBACK24-NEXT: movl %esi, 56(%eax)
+; FALLBACK24-NEXT: movl %edi, 60(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT: movl (%eax), %ecx
+; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: subl %ebp, %eax
+; FALLBACK25-NEXT: movl 8(%eax), %esi
+; FALLBACK25-NEXT: movl 12(%eax), %edx
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: movl %edx, %edi
+; FALLBACK25-NEXT: shldl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 4(%eax), %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 16(%eax), %edi
+; FALLBACK25-NEXT: movl 20(%eax), %esi
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 24(%eax), %edi
+; FALLBACK25-NEXT: movl 28(%eax), %edx
+; FALLBACK25-NEXT: movl %edx, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 32(%eax), %edi
+; FALLBACK25-NEXT: movl 36(%eax), %esi
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%eax), %edx
+; FALLBACK25-NEXT: movl 44(%eax), %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%eax), %edx
+; FALLBACK25-NEXT: movl 60(%eax), %edi
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl (%eax), %ebx
+; FALLBACK25-NEXT: movl 52(%eax), %esi
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: negl %ebp
+; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl %edi, 60(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK25-NEXT: shll %cl, %ebx
+; FALLBACK25-NEXT: shldl %cl, %eax, %esi
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %edi, %eax
+; FALLBACK25-NEXT: movl %eax, 48(%ebp)
+; FALLBACK25-NEXT: movl %esi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl %edx, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT: movl (%eax), %eax
+; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: leal (,%eax,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK26-NEXT: subl %eax, %edi
+; FALLBACK26-NEXT: movl (%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 4(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 8(%edi), %esi
+; FALLBACK26-NEXT: movl %esi, %ecx
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 12(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 16(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: movl 20(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 24(%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 28(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 32(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: movl 36(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 40(%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 44(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 52(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %eax, %ebp
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: negl %eax
+; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK26-NEXT: movl 56(%edi), %eax
+; FALLBACK26-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edx, %esi
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK26-NEXT: movl %edx, (%eax)
+; FALLBACK26-NEXT: movl %esi, 56(%eax)
+; FALLBACK26-NEXT: movl %ecx, 60(%eax)
+; FALLBACK26-NEXT: movl %ebp, 48(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 52(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 40(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 44(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 32(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 36(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 24(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 28(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 16(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 20(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 8(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 12(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 4(%eax)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $204, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT: movl (%eax), %ebx
+; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: leal (,%ebx,8), %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: andl $60, %ebx
+; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: subl %ebx, %eax
+; FALLBACK27-NEXT: movl 4(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 8(%eax), %edi
+; FALLBACK27-NEXT: movl 12(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 16(%eax), %edi
+; FALLBACK27-NEXT: movl 20(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 24(%eax), %edi
+; FALLBACK27-NEXT: movl 28(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 32(%eax), %edi
+; FALLBACK27-NEXT: movl 36(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%eax), %ebp
+; FALLBACK27-NEXT: movl 44(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %ebp, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT: movl 56(%eax), %edx
+; FALLBACK27-NEXT: movl 60(%eax), %edi
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl (%eax), %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 52(%eax), %esi
+; FALLBACK27-NEXT: shldl %cl, %esi, %edx
+; FALLBACK27-NEXT: negl %ebx
+; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %edx, 56(%eax)
+; FALLBACK27-NEXT: movl %edi, 60(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: shldl %cl, %ebx, %esi
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT: movl %ebx, 48(%eax)
+; FALLBACK27-NEXT: movl %esi, 52(%eax)
+; FALLBACK27-NEXT: movl %ebp, 40(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 44(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 32(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 36(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 24(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 28(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 16(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 12(%eax)
+; FALLBACK27-NEXT: movl %edi, 4(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, (%eax)
+; FALLBACK27-NEXT: addl $204, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT: movl (%eax), %eax
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: andl $60, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: subl %edx, %ecx
+; FALLBACK28-NEXT: movl (%ecx), %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 4(%ecx), %edx
+; FALLBACK28-NEXT: movl %ecx, %ebp
+; FALLBACK28-NEXT: shll $3, %eax
+; FALLBACK28-NEXT: andl $24, %eax
+; FALLBACK28-NEXT: movl %edx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %al, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %esi, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 12(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 8(%ebp), %esi
+; FALLBACK28-NEXT: movl %ebp, %edi
+; FALLBACK28-NEXT: movl %esi, %ebp
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl 20(%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 16(%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %esi, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %ebp, %edx
+; FALLBACK28-NEXT: movl 28(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 24(%ebp), %esi
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %esi, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 36(%edx), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 32(%edx), %esi
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 40(%ebp), %esi
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%ebp), %esi
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: negl %edx
+; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: movl 60(%edi), %edx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl 56(%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %edx, (%eax)
+; FALLBACK28-NEXT: movl %esi, 56(%eax)
+; FALLBACK28-NEXT: movl %edi, 60(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT: movl (%eax), %ecx
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: subl %ebp, %eax
+; FALLBACK29-NEXT: movl 8(%eax), %esi
+; FALLBACK29-NEXT: movl 12(%eax), %edx
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: movl %edx, %edi
+; FALLBACK29-NEXT: shldl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 4(%eax), %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 16(%eax), %edi
+; FALLBACK29-NEXT: movl 20(%eax), %esi
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 24(%eax), %edi
+; FALLBACK29-NEXT: movl 28(%eax), %edx
+; FALLBACK29-NEXT: movl %edx, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 32(%eax), %edi
+; FALLBACK29-NEXT: movl 36(%eax), %esi
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%eax), %edx
+; FALLBACK29-NEXT: movl 44(%eax), %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%eax), %edx
+; FALLBACK29-NEXT: movl 60(%eax), %edi
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl (%eax), %ebx
+; FALLBACK29-NEXT: movl 52(%eax), %esi
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: negl %ebp
+; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl %edi, 60(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK29-NEXT: shll %cl, %ebx
+; FALLBACK29-NEXT: shldl %cl, %eax, %esi
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %edi, %eax
+; FALLBACK29-NEXT: movl %eax, 48(%ebp)
+; FALLBACK29-NEXT: movl %esi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl %edx, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT: movl (%eax), %eax
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: leal (,%eax,8), %edx
+; FALLBACK30-NEXT: andl $24, %edx
+; FALLBACK30-NEXT: andl $60, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK30-NEXT: subl %eax, %edi
+; FALLBACK30-NEXT: movl (%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 4(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %edx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 8(%edi), %esi
+; FALLBACK30-NEXT: movl %esi, %ecx
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 12(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 16(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: movl 20(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 24(%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 28(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 32(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: movl 36(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 40(%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 44(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 52(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %eax, %ebp
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: negl %eax
+; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK30-NEXT: movl 56(%edi), %eax
+; FALLBACK30-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edx, %esi
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK30-NEXT: movl %edx, (%eax)
+; FALLBACK30-NEXT: movl %esi, 56(%eax)
+; FALLBACK30-NEXT: movl %ecx, 60(%eax)
+; FALLBACK30-NEXT: movl %ebp, 48(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 52(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 40(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 44(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 32(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 36(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 24(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 28(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 16(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 20(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 8(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 12(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 4(%eax)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $204, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT: movl (%eax), %ebx
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: leal (,%ebx,8), %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: andl $60, %ebx
+; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: subl %ebx, %eax
+; FALLBACK31-NEXT: movl 4(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 8(%eax), %edi
+; FALLBACK31-NEXT: movl 12(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 16(%eax), %edi
+; FALLBACK31-NEXT: movl 20(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 24(%eax), %edi
+; FALLBACK31-NEXT: movl 28(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 32(%eax), %edi
+; FALLBACK31-NEXT: movl 36(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%eax), %ebp
+; FALLBACK31-NEXT: movl 44(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %ebp, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT: movl 56(%eax), %edx
+; FALLBACK31-NEXT: movl 60(%eax), %edi
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl (%eax), %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 52(%eax), %esi
+; FALLBACK31-NEXT: shldl %cl, %esi, %edx
+; FALLBACK31-NEXT: negl %ebx
+; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %edx, 56(%eax)
+; FALLBACK31-NEXT: movl %edi, 60(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: shldl %cl, %ebx, %esi
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT: movl %ebx, 48(%eax)
+; FALLBACK31-NEXT: movl %esi, 52(%eax)
+; FALLBACK31-NEXT: movl %ebp, 40(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 44(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 32(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 36(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 24(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 28(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 16(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 12(%eax)
+; FALLBACK31-NEXT: movl %edi, 4(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, (%eax)
+; FALLBACK31-NEXT: addl $204, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i512, ptr %src.ptr, align 1
+ %byteOff = load i512, ptr %byteOff.ptr, align 1
+ %bitOff = shl i512 %byteOff, 3
+ %res = shl i512 %src, %bitOff
+ store i512 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: shl_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rbx
; X64-SSE2-NEXT: movq (%rdi), %rax
@@ -2012,6 +19811,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq 48(%rdi), %rbx
; X64-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-SSE2-NEXT: movl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -2020,15 +19824,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
+; X64-SSE2-NEXT: shll $3, %esi
+; X64-SSE2-NEXT: andl $56, %esi
; X64-SSE2-NEXT: negl %esi
; X64-SSE2-NEXT: movslq %esi, %rax
; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx
@@ -2050,23 +19847,25 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: popq %rbx
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: shl_64bytes:
+; X64-SSE42-LABEL: shl_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rax
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
; X64-SSE42-NEXT: movl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %eax
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: shll $3, %eax
+; X64-SSE42-NEXT: andl $56, %eax
; X64-SSE42-NEXT: negl %eax
; X64-SSE42-NEXT: cltq
; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
@@ -2077,10 +19876,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX1-LABEL: shl_64bytes:
+; X64-AVX1-LABEL: shl_64bytes_qwordOff:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT: movl (%rsi), %eax
@@ -2089,7 +19890,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: andl $63, %eax
+; X64-AVX1-NEXT: shll $3, %eax
+; X64-AVX1-NEXT: andl $56, %eax
; X64-AVX1-NEXT: negl %eax
; X64-AVX1-NEXT: cltq
; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0
@@ -2100,17 +19902,20 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT: popq %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: shl_64bytes:
+; X64-AVX512-LABEL: shl_64bytes_qwordOff:
; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: pushq %rax
; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-AVX512-NEXT: movl (%rsi), %eax
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $63, %eax
+; X64-AVX512-NEXT: shll $3, %eax
+; X64-AVX512-NEXT: andl $56, %eax
; X64-AVX512-NEXT: negl %eax
; X64-AVX512-NEXT: cltq
; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0
@@ -2121,117 +19926,108 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT: popq %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
;
-; X86-SSE2-LABEL: shl_64bytes:
+; X86-SSE2-LABEL: shl_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 40(%eax), %ebp
-; X86-SSE2-NEXT: movl 44(%eax), %ebx
-; X86-SSE2-NEXT: movl 48(%eax), %edi
-; X86-SSE2-NEXT: movl 52(%eax), %esi
-; X86-SSE2-NEXT: movl 56(%eax), %edx
-; X86-SSE2-NEXT: movl 60(%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %eax
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: subl $188, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl (%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 40(%ecx), %ebp
+; X86-SSE2-NEXT: movl 44(%ecx), %ebx
+; X86-SSE2-NEXT: movl 48(%ecx), %edi
+; X86-SSE2-NEXT: movl 52(%ecx), %esi
+; X86-SSE2-NEXT: movl 56(%ecx), %edx
+; X86-SSE2-NEXT: movl 60(%ecx), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl (%ecx), %ecx
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: subl %eax, %ecx
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%ecx), %edx
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: shll $3, %ecx
+; X86-SSE2-NEXT: andl $56, %ecx
+; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: subl %ecx, %eax
+; X86-SSE2-NEXT: movl (%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-SSE2-NEXT: movl 4(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%ecx), %edx
+; X86-SSE2-NEXT: movl 12(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%ecx), %edx
+; X86-SSE2-NEXT: movl 8(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%ecx), %edx
+; X86-SSE2-NEXT: movl 20(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%ecx), %edx
+; X86-SSE2-NEXT: movl 16(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%ecx), %edx
+; X86-SSE2-NEXT: movl 28(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%ecx), %edx
+; X86-SSE2-NEXT: movl 24(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%ecx), %edx
+; X86-SSE2-NEXT: movl 36(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%ecx), %ebp
-; X86-SSE2-NEXT: movl 40(%ecx), %ebx
-; X86-SSE2-NEXT: movl 52(%ecx), %edi
-; X86-SSE2-NEXT: movl 60(%ecx), %esi
-; X86-SSE2-NEXT: movl 56(%ecx), %edx
-; X86-SSE2-NEXT: negl %eax
-; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 32(%eax), %edx
+; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 44(%eax), %ebp
+; X86-SSE2-NEXT: movl 40(%eax), %ebx
+; X86-SSE2-NEXT: movl 52(%eax), %edi
+; X86-SSE2-NEXT: movl 60(%eax), %esi
+; X86-SSE2-NEXT: movl 56(%eax), %edx
+; X86-SSE2-NEXT: negl %ecx
+; X86-SSE2-NEXT: movl 160(%esp,%ecx), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %edx, 56(%eax)
; X86-SSE2-NEXT: movl %esi, 60(%eax)
@@ -2239,7 +20035,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, 52(%eax)
; X86-SSE2-NEXT: movl %ebx, 40(%eax)
; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 32(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 36(%eax)
@@ -2259,16 +20055,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
+; X86-SSE2-NEXT: addl $188, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: shl_64bytes:
+; X86-SSE42-LABEL: shl_64bytes_qwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $128, %esp
+; X86-SSE42-NEXT: subl $140, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2278,15 +20074,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 48(%edx), %xmm3
; X86-SSE42-NEXT: movl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm4, %xmm4
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, (%esp)
-; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, (%esp)
+; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shll $3, %ecx
+; X86-SSE42-NEXT: andl $56, %ecx
; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: subl %ecx, %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
@@ -2298,12 +20095,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $128, %esp
+; X86-SSE42-NEXT: addl $140, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX1-LABEL: shl_64bytes:
+; X86-AVX1-LABEL: shl_64bytes_qwordOff:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: subl $128, %esp
+; X86-AVX1-NEXT: subl $140, %esp
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2315,7 +20112,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX1-NEXT: vmovups %ymm2, (%esp)
; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: andl $63, %ecx
+; X86-AVX1-NEXT: shll $3, %ecx
+; X86-AVX1-NEXT: andl $56, %ecx
; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: subl %ecx, %edx
; X86-AVX1-NEXT: vmovups (%edx), %xmm0
@@ -2327,13 +20125,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT: addl $128, %esp
+; X86-AVX1-NEXT: addl $140, %esp
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: shl_64bytes:
+; X86-AVX512-LABEL: shl_64bytes_qwordOff:
; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $128, %esp
+; X86-AVX512-NEXT: subl $140, %esp
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2342,7 +20140,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vmovups %zmm1, (%esp)
; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: andl $63, %ecx
+; X86-AVX512-NEXT: shll $3, %ecx
+; X86-AVX512-NEXT: andl $56, %ecx
; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
; X86-AVX512-NEXT: subl %ecx, %edx
; X86-AVX512-NEXT: vmovups (%edx), %xmm0
@@ -2354,18 +20153,4121 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $128, %esp
+; X86-AVX512-NEXT: addl $140, %esp
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
- %byteOff = load i512, ptr %byteOff.ptr, align 1
- %bitOff = shl i512 %byteOff, 3
+ %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i512 %qwordOff, 6
%res = shl i512 %src, %bitOff
store i512 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_64bytes:
+; FALLBACK0-LABEL: ashr_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rax
+; FALLBACK0-NEXT: movq 8(%rdi), %rcx
+; FALLBACK0-NEXT: movq 16(%rdi), %r8
+; FALLBACK0-NEXT: movq 24(%rdi), %r9
+; FALLBACK0-NEXT: movq 32(%rdi), %r10
+; FALLBACK0-NEXT: movq 40(%rdi), %r11
+; FALLBACK0-NEXT: movq 48(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %r14
+; FALLBACK0-NEXT: movl (%rsi), %edi
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %r14
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rdi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %edi
+; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT: movq %r8, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %r8, %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r10, %r8
+; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq %r10, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: leaq (%r14,%r14), %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: orq %r15, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r10, %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: movq %rbx, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r12
+; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: addq %rbx, %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: orq %r14, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: orq %r13, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %rdi
+; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT: movq %r14, 48(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r15
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: movq (%rdi), %rcx
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %r10
+; FALLBACK1-NEXT: movq 32(%rdi), %r11
+; FALLBACK1-NEXT: movq 40(%rdi), %rbx
+; FALLBACK1-NEXT: movq 48(%rdi), %r14
+; FALLBACK1-NEXT: movq 56(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %eax
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %rdi
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rax,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %eax
+; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq %r9, %r8
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq %r11, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK1-NEXT: movq %r14, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %r11, 48(%rdx)
+; FALLBACK1-NEXT: movq %rax, 56(%rdx)
+; FALLBACK1-NEXT: movq %r10, 32(%rdx)
+; FALLBACK1-NEXT: movq %r15, 40(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r8, 8(%rdx)
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: popq %r15
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %r10
+; FALLBACK2-NEXT: movq 32(%rdi), %r11
+; FALLBACK2-NEXT: movq 40(%rdi), %rbx
+; FALLBACK2-NEXT: movq 48(%rdi), %r14
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %eax
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rax,8), %ecx
+; FALLBACK2-NEXT: andl $56, %ecx
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9
+; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13
+; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8
+; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT: movl %ecx, %r12d
+; FALLBACK2-NEXT: notb %r12b
+; FALLBACK2-NEXT: addq %r9, %r9
+; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT: orq %rbx, %r9
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r13, %rdi
+; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT: addq %r10, %r10
+; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT: orq %r8, %r10
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT: orq %r15, %r8
+; FALLBACK2-NEXT: addq %r14, %r14
+; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT: orq %rbp, %r11
+; FALLBACK2-NEXT: addq %rax, %rax
+; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT: orq %r13, %rax
+; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT: movq %rax, 48(%rdx)
+; FALLBACK2-NEXT: movq %r11, 32(%rdx)
+; FALLBACK2-NEXT: movq %r8, 40(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT: movq %r10, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, (%rdx)
+; FALLBACK2-NEXT: movq %r9, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r15
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: movq (%rdi), %rcx
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %r10
+; FALLBACK3-NEXT: movq 32(%rdi), %r11
+; FALLBACK3-NEXT: movq 40(%rdi), %rbx
+; FALLBACK3-NEXT: movq 48(%rdi), %r14
+; FALLBACK3-NEXT: movq 56(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %eax
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %rdi
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rax,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %eax
+; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq %r9, %r8
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq %r11, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK3-NEXT: movq %r14, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT: movq %r11, 48(%rdx)
+; FALLBACK3-NEXT: movq %r10, 32(%rdx)
+; FALLBACK3-NEXT: movq %r15, 40(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r8, 8(%rdx)
+; FALLBACK3-NEXT: movq %rax, 56(%rdx)
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: popq %r15
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbp
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: pushq %rax
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movq 48(%rdi), %rax
+; FALLBACK4-NEXT: movq 56(%rdi), %rcx
+; FALLBACK4-NEXT: movl (%rsi), %edi
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rcx
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%rdi,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %edi
+; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r9,%r9), %r8
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: orq %r10, %r8
+; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK4-NEXT: movq %r10, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK4-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r14
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r14, %r10
+; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK4-NEXT: movq %r14, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r13
+; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: orq %r13, %r15
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: addq %r14, %r14
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: orq %r12, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbp
+; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: orq %rbp, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: addq %rbx, %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r9, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %rdi
+; FALLBACK4-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT: movq %r12, 48(%rdx)
+; FALLBACK4-NEXT: movq %r14, 32(%rdx)
+; FALLBACK4-NEXT: movq %r15, 40(%rdx)
+; FALLBACK4-NEXT: movq %r10, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %r8, (%rdx)
+; FALLBACK4-NEXT: addq $8, %rsp
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: popq %rbp
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movq 48(%rdi), %rcx
+; FALLBACK5-NEXT: movq 56(%rdi), %rdi
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT: movq %r10, %r8
+; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT: movq %r11, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r15
+; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: sarq %cl, %r11
+; FALLBACK5-NEXT: movq %r15, 8(%rdx)
+; FALLBACK5-NEXT: movq %r9, 48(%rdx)
+; FALLBACK5-NEXT: movq %r11, 56(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r14, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: pushq %rax
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movq 48(%rdi), %rcx
+; FALLBACK6-NEXT: movq 56(%rdi), %rdi
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rdi
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %esi
+; FALLBACK6-NEXT: andl $56, %esi
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT: movl %esi, %ebx
+; FALLBACK6-NEXT: notb %bl
+; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT: orq %r11, %r8
+; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT: orq %r12, %r11
+; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r9, %rdi
+; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT: orq %r14, %r9
+; FALLBACK6-NEXT: addq %r10, %r10
+; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT: orq %r15, %r10
+; FALLBACK6-NEXT: addq %rax, %rax
+; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT: orq %r13, %rax
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT: orq %rbp, %rcx
+; FALLBACK6-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %r10, 32(%rdx)
+; FALLBACK6-NEXT: movq %r9, 40(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r8, (%rdx)
+; FALLBACK6-NEXT: addq $8, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movq 48(%rdi), %rcx
+; FALLBACK7-NEXT: movq 56(%rdi), %rdi
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT: movq %r10, %r8
+; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT: movq %r11, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r15
+; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT: movq %r15, 8(%rdx)
+; FALLBACK7-NEXT: movq %r9, 48(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r14, (%rdx)
+; FALLBACK7-NEXT: movq %r10, 56(%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbp
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: pushq %rax
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK8-NEXT: movq 48(%rdi), %rax
+; FALLBACK8-NEXT: movq 56(%rdi), %rcx
+; FALLBACK8-NEXT: movl (%rsi), %edi
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rcx
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%rdi,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %edi
+; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r9,%r9), %r8
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: orq %r10, %r8
+; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK8-NEXT: movq %r10, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r14
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r14, %r10
+; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK8-NEXT: movq %r14, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r13
+; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: orq %r13, %r15
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: addq %r14, %r14
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: orq %r12, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbp
+; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: orq %rbp, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: addq %rbx, %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r9, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %rdi
+; FALLBACK8-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT: movq %r12, 48(%rdx)
+; FALLBACK8-NEXT: movq %r14, 32(%rdx)
+; FALLBACK8-NEXT: movq %r15, 40(%rdx)
+; FALLBACK8-NEXT: movq %r10, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %r8, (%rdx)
+; FALLBACK8-NEXT: addq $8, %rsp
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: popq %rbp
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK9-NEXT: movq 48(%rdi), %rcx
+; FALLBACK9-NEXT: movq 56(%rdi), %rdi
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT: movq %r10, %r8
+; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT: movq %r11, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r15
+; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: sarq %cl, %r11
+; FALLBACK9-NEXT: movq %r15, 8(%rdx)
+; FALLBACK9-NEXT: movq %r9, 48(%rdx)
+; FALLBACK9-NEXT: movq %r11, 56(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r14, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: pushq %rax
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK10-NEXT: movq 48(%rdi), %rcx
+; FALLBACK10-NEXT: movq 56(%rdi), %rdi
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rdi
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %esi
+; FALLBACK10-NEXT: andl $56, %esi
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT: movl %esi, %ebx
+; FALLBACK10-NEXT: notb %bl
+; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT: orq %r11, %r8
+; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT: orq %r12, %r11
+; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r9, %rdi
+; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT: orq %r14, %r9
+; FALLBACK10-NEXT: addq %r10, %r10
+; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT: orq %r15, %r10
+; FALLBACK10-NEXT: addq %rax, %rax
+; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT: orq %r13, %rax
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT: orq %rbp, %rcx
+; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %r10, 32(%rdx)
+; FALLBACK10-NEXT: movq %r9, 40(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r8, (%rdx)
+; FALLBACK10-NEXT: addq $8, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK11-NEXT: movq 48(%rdi), %rcx
+; FALLBACK11-NEXT: movq 56(%rdi), %rdi
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT: movq %r10, %r8
+; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT: movq %r11, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r15
+; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT: movq %r15, 8(%rdx)
+; FALLBACK11-NEXT: movq %r9, 48(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r14, (%rdx)
+; FALLBACK11-NEXT: movq %r10, 56(%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbp
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: pushq %rax
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK12-NEXT: movq 48(%rdi), %rax
+; FALLBACK12-NEXT: movq 56(%rdi), %rcx
+; FALLBACK12-NEXT: movl (%rsi), %edi
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rcx
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%rdi,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %edi
+; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r9,%r9), %r8
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: orq %r10, %r8
+; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK12-NEXT: movq %r10, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r14
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r14, %r10
+; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK12-NEXT: movq %r14, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r13
+; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: orq %r13, %r15
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: addq %r14, %r14
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: orq %r12, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbp
+; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: orq %rbp, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: addq %rbx, %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r9, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %rdi
+; FALLBACK12-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT: movq %r12, 48(%rdx)
+; FALLBACK12-NEXT: movq %r14, 32(%rdx)
+; FALLBACK12-NEXT: movq %r15, 40(%rdx)
+; FALLBACK12-NEXT: movq %r10, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %r8, (%rdx)
+; FALLBACK12-NEXT: addq $8, %rsp
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: popq %rbp
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK13-NEXT: movq 48(%rdi), %rcx
+; FALLBACK13-NEXT: movq 56(%rdi), %rdi
+; FALLBACK13-NEXT: movl (%rsi), %eax
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %eax
+; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq %r9, %rsi
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK13-NEXT: movq %r10, %r8
+; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK13-NEXT: movq %r11, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK13-NEXT: movq %rax, %r15
+; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: sarq %cl, %r11
+; FALLBACK13-NEXT: movq %r15, 8(%rdx)
+; FALLBACK13-NEXT: movq %r9, 48(%rdx)
+; FALLBACK13-NEXT: movq %r11, 56(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r14, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: pushq %rax
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK14-NEXT: movq 48(%rdi), %rcx
+; FALLBACK14-NEXT: movq 56(%rdi), %rdi
+; FALLBACK14-NEXT: movl (%rsi), %eax
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rdi
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rax,8), %esi
+; FALLBACK14-NEXT: andl $56, %esi
+; FALLBACK14-NEXT: andl $56, %eax
+; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK14-NEXT: movl %esi, %ebx
+; FALLBACK14-NEXT: notb %bl
+; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT: orq %r11, %r8
+; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT: orq %r12, %r11
+; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r9, %rdi
+; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT: orq %r14, %r9
+; FALLBACK14-NEXT: addq %r10, %r10
+; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT: orq %r15, %r10
+; FALLBACK14-NEXT: addq %rax, %rax
+; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT: orq %r13, %rax
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK14-NEXT: orq %rbp, %rcx
+; FALLBACK14-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK14-NEXT: movq %rax, 48(%rdx)
+; FALLBACK14-NEXT: movq %r10, 32(%rdx)
+; FALLBACK14-NEXT: movq %r9, 40(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r8, (%rdx)
+; FALLBACK14-NEXT: addq $8, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK15-NEXT: movq 48(%rdi), %rcx
+; FALLBACK15-NEXT: movq 56(%rdi), %rdi
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT: movq %r10, %r8
+; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT: movq %r11, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r15
+; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT: movq %r15, 8(%rdx)
+; FALLBACK15-NEXT: movq %r9, 48(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r14, (%rdx)
+; FALLBACK15-NEXT: movq %r10, 56(%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: ashr_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%ecx), %ebx
+; FALLBACK16-NEXT: movl 44(%ecx), %edi
+; FALLBACK16-NEXT: movl 48(%ecx), %esi
+; FALLBACK16-NEXT: movl 52(%ecx), %edx
+; FALLBACK16-NEXT: movl 56(%ecx), %eax
+; FALLBACK16-NEXT: movl 60(%ecx), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT: movl (%ebp), %ebp
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: sarl $31, %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, %ecx
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: andl $60, %esi
+; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT: shll $3, %ecx
+; FALLBACK16-NEXT: andl $24, %ecx
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: movl %ecx, %ebx
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %eax, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %edx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %edx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %esi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebx, %edx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi
+; FALLBACK16-NEXT: movl %edi, %eax
+; FALLBACK16-NEXT: movl %edx, %ebx
+; FALLBACK16-NEXT: movl %ebx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: movl %ebx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx
+; FALLBACK16-NEXT: leal (%edx,%edx), %ebp
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT: sarl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 60(%eax)
+; FALLBACK16-NEXT: movl %edx, 56(%eax)
+; FALLBACK16-NEXT: movl %esi, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl %edi, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: ashr_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl (%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%eax), %ebp
+; FALLBACK17-NEXT: movl 44(%eax), %ebx
+; FALLBACK17-NEXT: movl 48(%eax), %edi
+; FALLBACK17-NEXT: movl 52(%eax), %esi
+; FALLBACK17-NEXT: movl 56(%eax), %edx
+; FALLBACK17-NEXT: movl 60(%eax), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: sarl $31, %eax
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %edx
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: sarl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 60(%ebp)
+; FALLBACK17-NEXT: movl %esi, 48(%ebp)
+; FALLBACK17-NEXT: movl %edi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: ashr_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebp
+; FALLBACK18-NEXT: movl 44(%eax), %ebx
+; FALLBACK18-NEXT: movl 48(%eax), %edi
+; FALLBACK18-NEXT: movl 52(%eax), %esi
+; FALLBACK18-NEXT: movl 56(%eax), %edx
+; FALLBACK18-NEXT: movl 60(%eax), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %eax
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: sarl $31, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
+; FALLBACK18-NEXT: leal (,%eax,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ecx
+; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl %ecx, %edi
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %eax, %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %edx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebx, 56(%eax)
+; FALLBACK18-NEXT: movl %edi, 48(%eax)
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl %esi, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: ashr_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $188, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl (%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%eax), %ebp
+; FALLBACK19-NEXT: movl 44(%eax), %ebx
+; FALLBACK19-NEXT: movl 48(%eax), %edi
+; FALLBACK19-NEXT: movl 52(%eax), %esi
+; FALLBACK19-NEXT: movl 56(%eax), %edx
+; FALLBACK19-NEXT: movl 60(%eax), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %ecx
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: sarl $31, %eax
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, %ebp
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shll $3, %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl %edi, %edx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 56(%ebp)
+; FALLBACK19-NEXT: movl %esi, 48(%ebp)
+; FALLBACK19-NEXT: movl %edx, 52(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 44(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 32(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 36(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 16(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT: movl %eax, 60(%ebp)
+; FALLBACK19-NEXT: addl $188, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: ashr_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movl 48(%ecx), %edx
+; FALLBACK20-NEXT: movl 52(%ecx), %esi
+; FALLBACK20-NEXT: movl 56(%ecx), %edi
+; FALLBACK20-NEXT: movl 60(%ecx), %ecx
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: sarl $31, %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %esi
+; FALLBACK20-NEXT: andl $60, %esi
+; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %edi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %eax, %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT: leal (%edx,%edx), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: addl %edi, %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %eax, %edx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT: sarl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 60(%eax)
+; FALLBACK20-NEXT: movl %edx, 56(%eax)
+; FALLBACK20-NEXT: movl %esi, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl %edi, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: ashr_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movups (%eax), %xmm0
+; FALLBACK21-NEXT: movups 16(%eax), %xmm1
+; FALLBACK21-NEXT: movups 32(%eax), %xmm2
+; FALLBACK21-NEXT: movl 48(%eax), %edx
+; FALLBACK21-NEXT: movl 52(%eax), %esi
+; FALLBACK21-NEXT: movl 56(%eax), %edi
+; FALLBACK21-NEXT: movl 60(%eax), %eax
+; FALLBACK21-NEXT: movl (%ecx), %ecx
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: sarl $31, %eax
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %esi
+; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %edx
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: sarl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 60(%ebp)
+; FALLBACK21-NEXT: movl %esi, 48(%ebp)
+; FALLBACK21-NEXT: movl %edi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: ashr_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movl 48(%ecx), %edx
+; FALLBACK22-NEXT: movl 52(%ecx), %esi
+; FALLBACK22-NEXT: movl 56(%ecx), %edi
+; FALLBACK22-NEXT: movl 60(%ecx), %ecx
+; FALLBACK22-NEXT: movl (%eax), %eax
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: sarl $31, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: leal (,%eax,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: orl %edi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl %ecx, %edi
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %eax, %eax
+; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK22-NEXT: addl %ebp, %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT: orl %eax, %ebx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl %edx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebx, 56(%eax)
+; FALLBACK22-NEXT: movl %edi, 48(%eax)
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl %esi, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, (%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: ashr_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $188, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movups (%eax), %xmm0
+; FALLBACK23-NEXT: movups 16(%eax), %xmm1
+; FALLBACK23-NEXT: movups 32(%eax), %xmm2
+; FALLBACK23-NEXT: movl 48(%eax), %edx
+; FALLBACK23-NEXT: movl 52(%eax), %esi
+; FALLBACK23-NEXT: movl 56(%eax), %edi
+; FALLBACK23-NEXT: movl 60(%eax), %eax
+; FALLBACK23-NEXT: movl (%ecx), %ecx
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: sarl $31, %eax
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ecx, %ebp
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shll $3, %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %esi
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl %edi, %edx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %eax, 56(%ebp)
+; FALLBACK23-NEXT: movl %esi, 48(%ebp)
+; FALLBACK23-NEXT: movl %edx, 52(%ebp)
+; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 44(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 32(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 36(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 24(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 28(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 16(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 20(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 8(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 12(%ebp)
+; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, (%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT: movl %eax, 60(%ebp)
+; FALLBACK23-NEXT: addl $188, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: ashr_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK24-NEXT: movl 48(%ecx), %edx
+; FALLBACK24-NEXT: movl 52(%ecx), %esi
+; FALLBACK24-NEXT: movl 56(%ecx), %edi
+; FALLBACK24-NEXT: movl 60(%ecx), %ecx
+; FALLBACK24-NEXT: movl (%eax), %eax
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: sarl $31, %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %eax, %esi
+; FALLBACK24-NEXT: andl $60, %esi
+; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT: shll $3, %eax
+; FALLBACK24-NEXT: andl $24, %eax
+; FALLBACK24-NEXT: movl %edx, %edi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb %al, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %edi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %eax, %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT: leal (%edx,%edx), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: addl %edi, %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %eax, %edx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT: sarl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 60(%eax)
+; FALLBACK24-NEXT: movl %edx, 56(%eax)
+; FALLBACK24-NEXT: movl %esi, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl %edi, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: ashr_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: vmovups (%eax), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK25-NEXT: movl 48(%eax), %edx
+; FALLBACK25-NEXT: movl 52(%eax), %esi
+; FALLBACK25-NEXT: movl 56(%eax), %edi
+; FALLBACK25-NEXT: movl 60(%eax), %eax
+; FALLBACK25-NEXT: movl (%ecx), %ecx
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: sarl $31, %eax
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %esi
+; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %edx
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: sarl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 60(%ebp)
+; FALLBACK25-NEXT: movl %esi, 48(%ebp)
+; FALLBACK25-NEXT: movl %edi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: ashr_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK26-NEXT: movl 48(%ecx), %edx
+; FALLBACK26-NEXT: movl 52(%ecx), %esi
+; FALLBACK26-NEXT: movl 56(%ecx), %edi
+; FALLBACK26-NEXT: movl 60(%ecx), %ecx
+; FALLBACK26-NEXT: movl (%eax), %eax
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: sarl $31, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: leal (,%eax,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: orl %edi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl %ecx, %edi
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT: orl %edi, %ecx
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %eax, %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK26-NEXT: addl %ebp, %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK26-NEXT: orl %eax, %ebx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl %edx, 60(%eax)
+; FALLBACK26-NEXT: movl %ebx, 56(%eax)
+; FALLBACK26-NEXT: movl %edi, 48(%eax)
+; FALLBACK26-NEXT: movl %ecx, 52(%eax)
+; FALLBACK26-NEXT: movl %esi, 40(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 44(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 32(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 36(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 24(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 28(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 16(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 20(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 8(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 12(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, (%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 4(%eax)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: ashr_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $188, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: vmovups (%eax), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK27-NEXT: movl 48(%eax), %edx
+; FALLBACK27-NEXT: movl 52(%eax), %esi
+; FALLBACK27-NEXT: movl 56(%eax), %edi
+; FALLBACK27-NEXT: movl 60(%eax), %eax
+; FALLBACK27-NEXT: movl (%ecx), %ecx
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: sarl $31, %eax
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ecx, %ebp
+; FALLBACK27-NEXT: andl $60, %ebp
+; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shll $3, %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %esi
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl %edi, %edx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %eax, 56(%ebp)
+; FALLBACK27-NEXT: movl %esi, 48(%ebp)
+; FALLBACK27-NEXT: movl %edx, 52(%ebp)
+; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 44(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 32(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 36(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 24(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 28(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 16(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 20(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 8(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 12(%ebp)
+; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, (%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT: movl %eax, 60(%ebp)
+; FALLBACK27-NEXT: addl $188, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: ashr_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK28-NEXT: movl 48(%ecx), %edx
+; FALLBACK28-NEXT: movl 52(%ecx), %esi
+; FALLBACK28-NEXT: movl 56(%ecx), %edi
+; FALLBACK28-NEXT: movl 60(%ecx), %ecx
+; FALLBACK28-NEXT: movl (%eax), %eax
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: sarl $31, %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %eax, %esi
+; FALLBACK28-NEXT: andl $60, %esi
+; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT: shll $3, %eax
+; FALLBACK28-NEXT: andl $24, %eax
+; FALLBACK28-NEXT: movl %edx, %edi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb %al, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %edi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %eax, %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT: leal (%edx,%edx), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: addl %edi, %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %eax, %edx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT: sarl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 60(%eax)
+; FALLBACK28-NEXT: movl %edx, 56(%eax)
+; FALLBACK28-NEXT: movl %esi, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl %edi, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: ashr_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: vmovups (%eax), %ymm0
+; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK29-NEXT: movl 48(%eax), %edx
+; FALLBACK29-NEXT: movl 52(%eax), %esi
+; FALLBACK29-NEXT: movl 56(%eax), %edi
+; FALLBACK29-NEXT: movl 60(%eax), %eax
+; FALLBACK29-NEXT: movl (%ecx), %ecx
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: sarl $31, %eax
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %esi
+; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %edx
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: sarl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 60(%ebp)
+; FALLBACK29-NEXT: movl %esi, 48(%ebp)
+; FALLBACK29-NEXT: movl %edi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: ashr_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK30-NEXT: movl 48(%ecx), %edx
+; FALLBACK30-NEXT: movl 52(%ecx), %esi
+; FALLBACK30-NEXT: movl 56(%ecx), %edi
+; FALLBACK30-NEXT: movl 60(%ecx), %ecx
+; FALLBACK30-NEXT: movl (%eax), %eax
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: sarl $31, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: leal (,%eax,8), %edx
+; FALLBACK30-NEXT: andl $24, %edx
+; FALLBACK30-NEXT: andl $60, %ecx
+; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %edx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: orl %edi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl %ecx, %edi
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK30-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK30-NEXT: orl %edi, %ecx
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %eax, %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK30-NEXT: addl %ebp, %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK30-NEXT: orl %eax, %ebx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl %edx, 60(%eax)
+; FALLBACK30-NEXT: movl %ebx, 56(%eax)
+; FALLBACK30-NEXT: movl %edi, 48(%eax)
+; FALLBACK30-NEXT: movl %ecx, 52(%eax)
+; FALLBACK30-NEXT: movl %esi, 40(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 44(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 32(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 36(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 24(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 28(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 16(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 20(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 8(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 12(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, (%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 4(%eax)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: ashr_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $188, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: vmovups (%eax), %ymm0
+; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK31-NEXT: movl 48(%eax), %edx
+; FALLBACK31-NEXT: movl 52(%eax), %esi
+; FALLBACK31-NEXT: movl 56(%eax), %edi
+; FALLBACK31-NEXT: movl 60(%eax), %eax
+; FALLBACK31-NEXT: movl (%ecx), %ecx
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: sarl $31, %eax
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ecx, %ebp
+; FALLBACK31-NEXT: andl $60, %ebp
+; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shll $3, %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %esi
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl %edi, %edx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %eax, 56(%ebp)
+; FALLBACK31-NEXT: movl %esi, 48(%ebp)
+; FALLBACK31-NEXT: movl %edx, 52(%ebp)
+; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 44(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 32(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 36(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 24(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 28(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 16(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 20(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 8(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 12(%ebp)
+; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, (%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT: movl %eax, 60(%ebp)
+; FALLBACK31-NEXT: addl $188, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i512, ptr %src.ptr, align 1
+ %byteOff = load i512, ptr %byteOff.ptr, align 1
+ %bitOff = shl i512 %byteOff, 3
+ %res = ashr i512 %src, %bitOff
+ store i512 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: ashr_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rbx
; X64-SSE2-NEXT: movq (%rdi), %rax
@@ -2394,15 +24296,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT: andl $7, %esi
+; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
; X64-SSE2-NEXT: movq %r11, 56(%rdx)
; X64-SSE2-NEXT: movq %r10, 32(%rdx)
@@ -2414,8 +24316,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: popq %rbx
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: ashr_64bytes:
+; X64-SSE42-LABEL: ashr_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rax
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
@@ -2424,9 +24327,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movl (%rsi), %esi
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: sarq $63, %rcx
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -2436,19 +24339,21 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %esi
-; X64-SSE42-NEXT: movups -128(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3
+; X64-SSE42-NEXT: andl $7, %esi
+; X64-SSE42-NEXT: movups -128(%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT: movups -112(%rsp,%rsi,8), %xmm1
+; X64-SSE42-NEXT: movups -96(%rsp,%rsi,8), %xmm2
+; X64-SSE42-NEXT: movups -80(%rsp,%rsi,8), %xmm3
; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: ashr_64bytes:
+; X64-AVX-LABEL: ashr_64bytes_qwordOff:
; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rax
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
; X64-AVX-NEXT: movq 48(%rdi), %rax
@@ -2456,7 +24361,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX-NEXT: movl (%rsi), %esi
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: sarq $63, %rcx
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -2467,25 +24372,26 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $63, %esi
-; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2
-; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3
+; X64-AVX-NEXT: andl $7, %esi
+; X64-AVX-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1
+; X64-AVX-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2
+; X64-AVX-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3
; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT: popq %rax
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
-; X86-SSE2-LABEL: ashr_64bytes:
+; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
+; X86-SSE2-NEXT: subl $188, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2506,7 +24412,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 32(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 40(%eax), %ebp
; X86-SSE2-NEXT: movl 44(%eax), %ebx
; X86-SSE2-NEXT: movl 48(%eax), %edi
@@ -2520,7 +24426,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2558,33 +24464,33 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 56(%eax)
; X86-SSE2-NEXT: movl %edx, 60(%eax)
@@ -2592,7 +24498,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, 52(%eax)
; X86-SSE2-NEXT: movl %ebx, 40(%eax)
; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 32(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 36(%eax)
@@ -2612,14 +24518,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
+; X86-SSE2-NEXT: addl $188, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: ashr_64bytes:
+; X86-SSE42-LABEL: ashr_64bytes_qwordOff:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: pushl %ebx
; X86-SSE42-NEXT: pushl %edi
@@ -2640,9 +24546,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
; X86-SSE42-NEXT: sarl $31, %edx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2660,11 +24566,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2
+; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3
; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
@@ -2675,7 +24581,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_64bytes:
+; X86-AVX-LABEL: ashr_64bytes_qwordOff:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
@@ -2695,7 +24601,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovups %ymm0, (%esp)
; X86-AVX-NEXT: sarl $31, %edx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2714,11 +24620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $63, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX-NEXT: andl $7, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX-NEXT: vmovups %xmm3, 48(%eax)
; X86-AVX-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
@@ -2730,45 +24636,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
- %byteOff = load i512, ptr %byteOff.ptr, align 1
- %bitOff = shl i512 %byteOff, 3
+ %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i512 %qwordOff, 6
%res = ashr i512 %src, %bitOff
store i512 %res, ptr %dst, align 1
ret void
}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; FALLBACK0: {{.*}}
-; FALLBACK1: {{.*}}
-; FALLBACK10: {{.*}}
-; FALLBACK11: {{.*}}
-; FALLBACK12: {{.*}}
-; FALLBACK13: {{.*}}
-; FALLBACK14: {{.*}}
-; FALLBACK15: {{.*}}
-; FALLBACK16: {{.*}}
-; FALLBACK17: {{.*}}
-; FALLBACK18: {{.*}}
-; FALLBACK19: {{.*}}
-; FALLBACK2: {{.*}}
-; FALLBACK20: {{.*}}
-; FALLBACK21: {{.*}}
-; FALLBACK22: {{.*}}
-; FALLBACK23: {{.*}}
-; FALLBACK24: {{.*}}
-; FALLBACK25: {{.*}}
-; FALLBACK26: {{.*}}
-; FALLBACK27: {{.*}}
-; FALLBACK28: {{.*}}
-; FALLBACK29: {{.*}}
-; FALLBACK3: {{.*}}
-; FALLBACK30: {{.*}}
-; FALLBACK31: {{.*}}
-; FALLBACK4: {{.*}}
-; FALLBACK5: {{.*}}
-; FALLBACK6: {{.*}}
-; FALLBACK7: {{.*}}
-; FALLBACK8: {{.*}}
-; FALLBACK9: {{.*}}
; X64: {{.*}}
; X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc87970..8c0873492ce402 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,61 +588,58 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ah
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -655,50 +652,39 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -711,51 +697,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -768,47 +752,40 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -899,66 +876,62 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $60, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $60, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -967,58 +940,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -1027,34 +987,32 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
@@ -1072,7 +1030,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1081,57 +1039,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1218,62 +1164,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1286,51 +1231,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1343,52 +1279,52 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1401,48 +1337,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1459,35 +1390,34 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
@@ -1496,142 +1426,124 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -1640,127 +1552,120 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi,4), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1775,95 +1680,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
@@ -1879,103 +1756,95 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1988,92 +1857,73 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2089,31 +1939,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: negb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %sil, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: negb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -2146,79 +1996,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $24, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax
@@ -2226,50 +2067,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $24, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2278,118 +2109,112 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: negb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2398,7 +2223,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2413,99 +2238,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $28, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
@@ -2519,106 +2315,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2631,95 +2426,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $28, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2735,36 +2510,36 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
@@ -2773,145 +2548,130 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -2920,17 +2680,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx
@@ -2942,7 +2702,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2953,95 +2713,94 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -3088,64 +2847,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
@@ -3161,106 +2897,101 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -3273,93 +3004,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -3381,6 +3098,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3390,6 +3108,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3398,18 +3121,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
@@ -3417,7 +3132,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
@@ -3426,7 +3140,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -3478,6 +3192,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
@@ -3488,22 +3203,24 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -3511,73 +3228,41 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3588,6 +3273,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3597,6 +3283,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3606,52 +3297,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
@@ -3662,10 +3344,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -3676,11 +3359,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
@@ -3691,6 +3371,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3700,60 +3385,39 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3762,40 +3426,44 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3806,8 +3474,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3816,214 +3483,199 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -4036,209 +3688,153 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4252,42 +3848,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -4297,6 +3897,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4307,163 +3908,141 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
@@ -4478,7 +4057,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4489,7 +4068,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
@@ -4499,7 +4078,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
@@ -4508,13 +4087,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -4522,9 +4105,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -4534,138 +4118,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4680,7 +4216,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
@@ -4695,6 +4230,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -4703,107 +4243,91 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r14), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r14), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r14), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
@@ -4815,7 +4339,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
@@ -4823,77 +4352,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r10), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r10), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r10), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -4904,6 +4398,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
@@ -4913,6 +4408,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -4922,68 +4422,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -4994,12 +4484,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
@@ -5009,6 +4496,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -5018,65 +4510,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -5085,42 +4552,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5129,6 +4598,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5137,200 +4609,179 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: subl %ecx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, (%esp) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 52(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5353,7 +4804,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -5366,213 +4817,153 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %ebp, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -5585,50 +4976,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $216, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5641,179 +5037,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 212(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 36(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $216, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -5827,43 +5194,43 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ebx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5872,6 +5239,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5882,148 +5252,95 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 176(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
@@ -6045,6 +5362,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -6072,9 +5390,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
@@ -6082,7 +5400,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
@@ -6091,7 +5408,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -6143,6 +5460,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
@@ -6153,22 +5471,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -6176,74 +5491,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6254,6 +5545,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -6281,44 +5573,43 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
@@ -6329,10 +5620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -6343,11 +5635,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
@@ -6376,52 +5665,39 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6430,12 +5706,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
@@ -6443,7 +5719,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
@@ -6452,19 +5728,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6473,7 +5749,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6482,7 +5758,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -6503,196 +5779,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -6705,7 +5980,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6718,7 +5993,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
@@ -6726,189 +6001,144 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -6942,199 +6172,199 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
@@ -7149,7 +6379,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -7158,7 +6388,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
@@ -7170,173 +6400,142 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
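For reference, a minimal sketch of the wide arithmetic-shift IR pattern that the ashr_64bytes checks above exercise (the function name is assumed for illustration; this is not copied from the test file). The interesting part is entirely in the generated code: after this change the X86 checks above compute the stack-slot byte offset as (bits >> 3) & 60 and the residual shift amount as bits & 31 (see the andl $31 / shrl $3 / andl $60 sequences), i.e. the value is shifted in aligned 32-bit register units rather than byte units.

; A hypothetical 64-byte (i512) funnel of loads, ashr, stores matching
; the tested signature: define void @ashr_64bytes(ptr, ptr, ptr) nounwind
define void @ashr_64bytes_sketch(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
  %src = load i512, ptr %src.ptr, align 1      ; unaligned wide load, as in the tests
  %bitOff = load i512, ptr %bitOff.ptr, align 1
  %res = ashr i512 %src, %bitOff               ; legalized via an aligned stack slot
  store i512 %res, ptr %dst, align 1
  ret void
}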
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 9ae1f270e88337..044be12a395433 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -432,30 +432,89 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -505,30 +564,89 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -577,30 +695,89 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -649,32 +826,128 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $32, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $32, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -689,58 +962,123 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movzbl -64(%rsp,%rax), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT: movb %al, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -756,58 +1094,136 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -822,58 +1238,136 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movl -64(%rsp,%rax), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -888,60 +1382,191 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movl %ecx, %eax
+; X64-SHLD-NEXT: shrb $6, %al
+; X64-SHLD-NEXT: movzbl %al, %eax
+; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $64, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $64, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -956,70 +1581,288 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-NEXT: movq -56(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, 8(%rdx)
-; X64-NEXT: movq %rcx, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $92, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %eax
+; X86-SHLD-NEXT: shrb $5, %al
+; X86-SHLD-NEXT: movzbl %al, %ebx
+; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $92, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -1034,84 +1877,155 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
}
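For readers tracing the new X86-SHLD checks above: the 16-byte chunk is assembled from five consecutive 32-bit stack words with four shrdl funnel shifts. What follows is a rough C model of that arithmetic, not the lowering itself; the names are made up for illustration, and the chunk is assumed to stay inside the zero-padded slot.

  #include <stdint.h>

  /* Five consecutive aligned dwords in, four funnel-shifted dwords out,
     mirroring the movl/shrdl pattern in the CHECK lines above. */
  void extract_16bytes(const uint32_t slot[16], unsigned bits,
                       uint32_t out[4]) {
    unsigned idx = bits >> 5;   /* shrb $5 + movzbl: dword index      */
    unsigned c   = bits & 31;   /* residual shift amount, as in %cl   */
    for (int i = 0; i < 4; ++i) {
      uint32_t lo = slot[idx + i], hi = slot[idx + i + 1];
      /* shrdl %cl, hi, lo : (lo >> c) | (hi << (32 - c)) for c > 0 */
      out[i] = c ? (lo >> c) | (hi << (32 - c)) : lo;
    }
  }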
define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movzbl -128(%rsp,%rsi), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movb %sil, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movb %cl, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1127,84 +2041,155 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
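The X64 checks above reduce the variable-offset byte extract to one aligned 8-byte load plus a two-word funnel: the offset is split into a word index (andl $56 on the byte offset) and a residual shift (andl $56 on the bit offset). A minimal C sketch of that computation, assuming a little-endian target and offsets below 64; all names here are illustrative, not from the test file.

  #include <stdint.h>
  #include <string.h>

  uint8_t extract_byte(const uint8_t src[32], unsigned off) {
    uint64_t slot[16] = {0};        /* aligned slot, upper half zero   */
    memcpy(slot, src, 32);          /* the 32 defined bytes            */
    unsigned bit = (off * 8) & 56;  /* residual shift, as in %ecx      */
    unsigned idx = (off & 56) >> 3; /* containing aligned word         */
    uint64_t lo = slot[idx], hi = slot[idx + 1];
    /* (hi << 1) << (bit ^ 63) equals hi << (64 - bit) and stays well
       defined at bit == 0, mirroring the notl/addl/shlq trick above. */
    return (uint8_t)((lo >> bit) | ((hi << 1) << (bit ^ 63)));
  }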
define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movw %si, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1219,84 +2204,155 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
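A small piece of arithmetic the masks in these checks lean on: scaling the byte offset by 8 and then masking with 56 yields the same bits as masking the offset with 7 first, so the word index and the residual shift amount can be derived from the offset independently. A self-contained check of the identity:

  #include <assert.h>

  int main(void) {
    /* (off * 8) & 56 == (off & 7) * 8 for every in-slot byte offset,
       matching the leal/andl pairs in the CHECK lines above. */
    for (unsigned off = 0; off < 64; ++off)
      assert(((off * 8) & 56) == ((off & 7) * 8));
    return 0;
  }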
define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movl -128(%rsp,%rsi), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movl %esi, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1311,86 +2367,216 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: pushq %rax
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: andl $56, %esi
+; X64-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: movq %rax, (%rdx)
+; X64-SHLD-NEXT: popq %rax
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $128, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %esi
+; X86-SHLD-NEXT: andl $60, %esi
+; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-SHLD-NEXT: movl (%esp,%esi), %edx
+; X86-SHLD-NEXT: movl 4(%esp,%esi), %esi
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: movl %esi, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edx, (%eax)
+; X86-SHLD-NEXT: addl $128, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1405,96 +2591,326 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
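This hunk shows the trade-off side by side: the removed checks index the stack slot at byte granularity and rely on a potentially unaligned movq, while the new checks keep every load naturally aligned and recombine with the same funnel expression sketched earlier. A hedged C rendering of both access patterns, illustrative only:

  #include <stdint.h>
  #include <string.h>

  uint64_t old_style(const uint8_t *slot, unsigned off) {
    uint64_t v;                        /* movq -128(%rsp,%rsi): the load  */
    memcpy(&v, slot + (off & 63), 8);  /* may straddle an 8-byte boundary */
    return v;
  }

  uint64_t new_style(const uint64_t *slot, unsigned off) {
    unsigned bit = (off * 8) & 56;     /* residual shift                  */
    unsigned idx = (off & 56) >> 3;    /* aligned word index              */
    return (slot[idx] >> bit) | ((slot[idx + 1] << 1) << (bit ^ 63));
  }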
define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-NEXT: movq %rcx, 8(%rdx)
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $156, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edi
+; X86-SHLD-NEXT: andl $60, %edi
+; X86-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%edi), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%edi), %ebx
+; X86-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: movl 28(%esp,%edi), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%edi), %edi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edi, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $156, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
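Where the NO-SHLD blocks spell that combine out, the HAVE-SHLD prefixes fold it into one instruction: shrd shifts a two-word pair right as a unit and keeps the low word. A short C++ model of the 32-bit form used by the X86-SHLD checks, with a made-up helper name:

#include <cstdint>

// Models "shrdl %cl, hi, lo": the low 32 bits of (hi:lo) >> cl, with the
// count masked to 5 bits the way the hardware masks %cl.
uint32_t shrd32(uint32_t lo, uint32_t hi, unsigned amt) {
  uint64_t pair = ((uint64_t)hi << 32) | lo;  // treat hi:lo as one value
  return (uint32_t)(pair >> (amt & 31));
}

Chained across successive loads, with each shrdl's high input taken from the next word, this performs an arbitrarily wide right shift one word at a time, which is the ladder visible in the X86-SHLD blocks above.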
@@ -1509,116 +2925,484 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-NEXT: movq -112(%rsp,%rsi), %rdi
-; X64-NEXT: movq -104(%rsp,%rsi), %rsi
-; X64-NEXT: movq %rsi, 24(%rdx)
-; X64-NEXT: movq %rdi, 16(%rdx)
-; X64-NEXT: movq %rcx, 8(%rdx)
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $136, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movdqu (%ecx), %xmm0
-; X86-NEXT: movdqu 16(%ecx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %eax
-; X86-NEXT: movl 8(%esp,%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esp,%eax), %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 16(%esp,%eax), %esi
-; X86-NEXT: movl 20(%esp,%eax), %edi
-; X86-NEXT: movl 24(%esp,%eax), %ebx
-; X86-NEXT: movl 28(%esp,%eax), %ebp
-; X86-NEXT: movl 32(%esp,%eax), %edx
-; X86-NEXT: movl 36(%esp,%eax), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, 28(%eax)
-; X86-NEXT: movl %edx, 24(%eax)
-; X86-NEXT: movl %ebp, 20(%eax)
-; X86-NEXT: movl %ebx, 16(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $136, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %r8b
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %r11, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $156, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edi
+; X86-SHLD-NEXT: andl $60, %edi
+; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx
+; X86-SHLD-NEXT: movl 20(%esp,%edi), %eax
+; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: movl %eax, %esi
+; X86-SHLD-NEXT: movl %edx, %eax
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: movl 32(%esp,%edi), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-SHLD-NEXT: movl 36(%esp,%edi), %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-SHLD-NEXT: movl 40(%esp,%edi), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl 44(%esp,%edi), %eax
+; X86-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-SHLD-NEXT: movl 48(%esp,%edi), %edi
+; X86-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SHLD-NEXT: movl %eax, 28(%edi)
+; X86-SHLD-NEXT: movl %edx, 24(%edi)
+; X86-SHLD-NEXT: movl %esi, 20(%edi)
+; X86-SHLD-NEXT: movl %ebp, 16(%edi)
+; X86-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 12(%edi)
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 8(%edi)
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 4(%edi)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-SHLD-NEXT: movl %ebx, (%edi)
+; X86-SHLD-NEXT: addl $156, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1633,9 +3417,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
-; X64-NO-BMI2-HAVE-SHLD: {{.*}}
+; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
+; X86: {{.*}}
; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-SHLD: {{.*}}
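Taken together, the regenerated blocks in this file follow one template: spill the value plus its zeroed upper half to a 128-byte slot, split the shift amount into a word-aligned byte offset (andl $56 on the 64-bit prefixes, andl $60 on the 32-bit ones) and a residual bit offset (andl $56 or andl $24 applied to eight times the byte offset), load whole words at the aligned offset, and funnel-combine neighbours only when the residual is nonzero; the removed X64/X86 output instead indexed the slot per byte (andl $63) and loaded directly. The following self-contained C++ sketch models the 64-bit-unit case; the function, the driver values, and the names are illustrative assumptions, not LLVM code.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reads 'dstUnits' 64-bit words of (val >> (byteOff * 8)) for a 256-bit
// value held in a 64-byte alloca whose upper half is known zero,
// mirroring the spill-and-index pattern in the CHECK lines above.
static void loadChunkThroughStack(const uint64_t val[4], unsigned byteOff,
                                  uint64_t *dst, unsigned dstUnits) {
  uint64_t slot[16] = {};                        // 128-byte zeroed slot
  std::memcpy(slot, val, 4 * sizeof(uint64_t));  // the movaps spills
  unsigned unitIdx = (byteOff & 56) / 8;         // andl $56, %esi
  unsigned bitOff = (byteOff * 8) & 56;          // leal (,%rsi,8); andl $56
  for (unsigned i = 0; i != dstUnits; ++i) {
    uint64_t lo = slot[unitIdx + i];
    if (bitOff == 0) {
      dst[i] = lo;  // word-aligned shift: plain load, no follow-up shift
    } else {
      uint64_t hi = slot[unitIdx + i + 1];
      dst[i] = (lo >> bitOff) | (hi << (64 - bitOff));  // shrd-style combine
    }
  }
}

int main() {
  const uint64_t v[4] = {0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
                         0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL};
  uint64_t out[2];
  loadChunkThroughStack(v, 5, out, 2);  // a 16-byte chunk at byte offset 5
  std::printf("%016llx %016llx\n",      // bytes 5..20 of v, little-endian
              (unsigned long long)out[0], (unsigned long long)out[1]);
  return 0;
}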
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 4a47e7613dfa6d..ff13f4ba577f2e 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -603,32 +603,86 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
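The 16-byte-alloca tests in this second file show the same split at 32-bit granularity: the removed X86 output masked the slot index with andb $15 and loaded the chunk directly, while the regenerated blocks mask with andb $12 so the word load is 4-byte aligned and leave the low bits of the count in %cl, which the 32-bit shifts mask to 5 bits on their own. A minimal model of that index split, with a hypothetical helper name:

#include <cstdint>

// Splits the bit count (byteOff * 8, as built by "shll $3, %ecx") into
// the 4-byte-aligned slot byte used to address the spill and the
// residual count left for the register shift, matching the
// "shrb $3, %dl; andb $12, %dl" sequence above.
unsigned splitOffsets(unsigned bitCount, unsigned &slotByte) {
  slotByte = (bitCount >> 3) & 12;  // shrb $3 ; andb $12
  return bitCount & 31;             // the shift masks %cl to 5 bits anyway
}

The two words at slotByte and slotByte + 4 are then funnel-combined by the returned count exactly as in the 64-bit sketch earlier.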
@@ -711,32 +765,86 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -818,32 +926,86 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -925,34 +1087,125 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $32, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $32, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -967,64 +1220,128 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; no @load_16byte_chunk_of_16byte_alloca
define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movzbl -64(%rsp,%rax), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT: movb %al, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1038,64 +1355,141 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1108,64 +1502,141 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movl -64(%rsp,%rax), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1178,66 +1649,197 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movl %ecx, %eax
+; X64-SHLD-NEXT: shrb $6, %al
+; X64-SHLD-NEXT: movzbl %al, %eax
+; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $64, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $64, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1250,76 +1852,295 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-NEXT: movq -56(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, 8(%rdx)
-; X64-NEXT: movq %rcx, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $92, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %eax
+; X86-SHLD-NEXT: shrb $5, %al
+; X86-SHLD-NEXT: movzbl %al, %ebx
+; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $92, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1334,7 +2155,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; no @load_32byte_chunk_of_32byte_alloca
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
+; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86: {{.*}}
; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}
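
For reference, all of the regenerated sequences above implement the same shift-through-stack pattern: spill the wide value into a zero-padded, aligned stack slot, split the shift amount into a whole-register part (used to index into the slot) and an in-register remainder, then funnel adjacent native-width words together. A minimal C sketch of what the X64 check lines compute (the function and variable names here are illustrative only, not LLVM source; it assumes byteOff <= 16 as in the test):

  #include <stdint.h>
  #include <string.h>

  void load_16byte_chunk_of_32byte_alloca(const void *src, uint64_t byteOff,
                                          void *dst) {
    uint64_t slot[8] = {0};          // 32 data bytes + 32 zero bytes (the movaps stores)
    memcpy(slot, src, 32);           // the two movups loads spilled to the slot
    uint64_t bits = byteOff * 8;     // leal (,%rsi,8) / shll $3
    uint64_t idx = bits >> 6;        // shrb $6 + movzbl: whole 64-bit words
    uint64_t amt = bits & 63;        // remainder, shifted in registers
    uint64_t w0 = slot[idx], w1 = slot[idx + 1], w2 = slot[idx + 2];
    // Doubling the high word first (leaq (%r9,%r9) / addq %rax,%rax) keeps the
    // left-shift count in [0,63], so amt == 0 needs no special case.
    uint64_t lo = (w0 >> amt) | ((w1 * 2) << (~bits & 63));
    uint64_t hi = (w1 >> amt) | ((w2 * 2) << (~bits & 63));
    memcpy(dst, &lo, 8);
    memcpy((char *)dst + 8, &hi, 8);
  }

The X86 variants have the same shape but use 32-bit words, which is visible above as shrb $5 for the word index and 4-byte scaled loads from the slot.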